In [None]:
from pagescrapping.wayland import WaylandGamesScraper
import asyncio

scraper = WaylandGamesScraper()

# Set the actual URL
start_url = "https://www.waylandgames.co.uk"  # Replace with the real URL of the target page

# Run the scraper asynchronously
asyncio.run(scraper.run(start_url))

# Save the data to a JSON file
scraper.save_data('scraped_data.json')

# Optionally, display the scraped data
scraped_data = scraper.get_data()
scraped_data  # This will display the scraped data in the notebook


In [None]:
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import asyncio
import nest_asyncio
import json
import logging

class WebScraper:
    def __init__(self, start_url, link_selector, subsequent_link_selector, max_initial_links):
        self.start_url = start_url
        self.link_selector = link_selector  
        self.subsequent_link_selector = subsequent_link_selector  
        self.max_initial_links = max_initial_links
        self.visited_urls = set()
        self.final_links = set()
        self.stack = []  # Stack to manage DFS

    async def scrape(self):
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                await self._scrape_page(page, self.start_url)  # Start scraping from the main page
            except Exception as e:
                logging.error(f"Error during scraping: {e}")
            finally:
                await browser.close()
                self.save_to_json('tmp/final_links.json')

    async def _scrape_page(self, page, url):
        self.stack.append(url)
        initial_links_count = 0
        is_first_page = True
        
        while self.stack:
            current_url = self.stack.pop()

            if current_url in self.visited_urls:
                continue

            try:
                logging.info(f"Navigating to {current_url}")
                await page.goto(current_url, timeout=60000)

                if is_first_page:
                    logging.info(f"Waiting for initial selector {self.link_selector}")
                    await page.wait_for_selector(self.link_selector, timeout=20000)
                    
                    
                    links = await page.query_selector_all(f"{self.link_selector} a")
                    logging.info(f"Found {len(links)} links on {current_url}")

                    
                    links = links[:self.max_initial_links]
                    is_first_page = False  

                    for link in links:
                        href = await link.get_attribute('href')
                        if href:
                            full_url = urljoin(self.start_url, href)
                            if full_url not in self.visited_urls:
                                self.stack.append(full_url)
                                logging.info(f"Added initial link to stack: {full_url}")
                else:
                    logging.info(f"Waiting for subsequent selector {self.subsequent_link_selector}")
                    await page.wait_for_selector(self.subsequent_link_selector, timeout=20000)

                    links = await page.query_selector_all(f"{self.subsequent_link_selector} a")
                    logging.info(f"Found {len(links)} subsequent links on {current_url}")

                    if not links:
                        logging.info(f"No more links on {current_url}, adding to final links")
                        self.final_links.add(current_url)
                        self.visited_urls.add(current_url)
                    else:
                        for link in links:
                            href = await link.get_attribute('href')
                            if href:
                                full_url = urljoin(self.start_url, href)
                                if full_url not in self.visited_urls:
                                    self.final_links.add(full_url)
                                    logging.info(f"retrieved link: {full_url}")

                logging.info(f"cxurrent stack size: {len(self.stack)}")

            except Exception as e:
                logging.error(f"an error occurred while scraping {current_url}: {e}")

    def save_to_json(self, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(list(self.final_links), f, ensure_ascii=False, indent=4)
        logging.info(f"Data saved to {filename}")

start_url = "https://elementgames.co.uk/"
link_selector = "body > nav > div:nth-child(3) > div > div > div > ul" 
subsequent_link_selector = "body > div.section-colored.text-center > div > div:nth-child(2)"
max_initial_links = 1 

scraper = WebScraper(start_url, link_selector, subsequent_link_selector, max_initial_links)
asyncio.run(scraper.scrape())


In [None]:
import json
import asyncio
from playwright.async_api import async_playwright
import pandas as pd
import nest_asyncio
import logging
from read import pd_to_file, pd_read_file

nest_asyncio.apply()

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

async def retrieve_product_info(json_file, output_file):
    async with async_playwright() as p:
        browser = await p.chromium.launch(headless=True)
        page = await browser.new_page()
        
        # Load the URLs from the JSON file
        with open(json_file, 'r', encoding='utf-8') as f:
            urls = json.load(f)

        results = []

        for url in urls:
            retries = 3
            while retries > 0:
                try:
                    logging.info(f"Navigating to {url}")
                    await page.goto(url, timeout=20000)

    
                    title_element = await page.query_selector("body > div.section-colored.text-center > div > div.product-wrap.row > div.product-info.col-xs-12.col-sm-12.col-md-7.col-lg-7 > div > div > h1")
                    title = await title_element.inner_text() if title_element else "Title not found"

                   
                    price_element = await page.query_selector("#testproduct > div > div.price-wrap > span.currentPrice")
                    price = await price_element.inner_text() if price_element else "price not found"
                    results.append({"url": url, "product_title": title, "price": price})
                    logging.info(f"Retrieved: {title} - {price}")
                    break

                except Exception as e:
                    retries -= 1
                    logging.error(f"Error retrieving information from {url}: {e}. Retries left: {retries}")
                    if retries == 0:
                        results.append({"url": url, "product_title": "Error", "price": "Error"})

        await browser.close()

        df = pd.DataFrame(results)
        pd_to_file(df, output_file, index=False)
        logging.info(f"Data saved to {output_file}")

if __name__ == "__main__":
    json_input_file = 'tmp/elementalgame.json'
    output_csv_file = 'tmp/elementgame.csv'
    asyncio.run(retrieve_product_info(json_input_file, output_csv_file))

In [None]:
from read import pd_to_file, pd_read_file
df = pd_read_file('tmp/elementgame.csv', sep="\t")
df

In [1]:
######################argos 
from urllib.parse import urljoin
from playwright.async_api import async_playwright
import asyncio
import nest_asyncio
import json
import logging

class WebScraper:
    def __init__(self, start_url, sel1, sel2, sel3, max_links):
        self.start_url = start_url
        self.sel1 = sel1  # First selector
        self.sel2 = sel2  # Second selector
        self.sel3 = sel3  # Third selector
        self.max_links = max_links
        self.visited = set()
        self.final_links = set()
        self.stack = []  # Stack to manage DFS

    async def scrape(self):
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            page = await browser.new_page()
            try:
                await self._scrape_page(page, self.start_url)  # Start from the main page
            except Exception as e:
                logging.error(f"Scrape error: {e}")
            finally:
                await browser.close()
                self.save_to_json('tmp/final_links.json')

    async def _scrape_page(self, page, url):
        self.stack.append(url)
        is_first_page = True
        
        while self.stack:
            curr_url = self.stack.pop()

            if curr_url in self.visited:
                continue

            try:
                logging.info(f"Navigating to {curr_url}")
                await page.goto(curr_url, timeout=60000)

                # For the first page
                if is_first_page:
                    logging.info(f"Waiting for initial selector {self.sel1}")
                    await page.wait_for_selector(self.sel1, timeout=60000)
                    
                    links = await page.query_selector_all(f"{self.sel1} a")
                    logging.info(f"Found {len(links)} links on {curr_url}")
                    
                    links = links[:self.max_links]  # Limit initial links
                    is_first_page = False  # Toggle off first page check
                    
                    for link in links:
                        href = await link.get_attribute('href')
                        if href:
                            full_url = urljoin(self.start_url, href)
                            if full_url not in self.visited:
                                self.stack.append(full_url)
                                logging.info(f"Added link to stack: {full_url}")
                
                # For the second page (based on sel2)
                else:
                    logging.info(f"Waiting for second selector {self.sel2}")
                    await page.wait_for_selector(self.sel2, timeout=20000)

                    links = await page.query_selector_all(f"{self.sel2} a")
                    logging.info(f"Found {len(links)} subsequent links on {curr_url}")

                    for link in links:
                        href = await link.get_attribute('href')
                        if href:
                            full_url = urljoin(self.start_url, href)
                            if full_url not in self.visited:
                                logging.info(f"Clicking through second-level link: {full_url}")
                                await self._scrape_deeper(page, full_url)
                    
            except Exception as e:
                logging.error(f"Scrape error at {curr_url}: {e}")

    async def _scrape_deeper(self, page, url):
        try:
            logging.info(f"Navigating deeper to {url}")
            await page.goto(url, timeout=60000)
            
            logging.info(f"Waiting for third selector {self.sel3}")
            await page.wait_for_selector(self.sel3, timeout=20000)
            
            links = await page.query_selector_all(f"{self.sel3} a")
            logging.info(f"Found {len(links)} final-layer links on {url}")

            for link in links:
                href = await link.get_attribute('href')
                if href:
                    full_url = urljoin(self.start_url, href)
                    if full_url not in self.visited:
                        self.final_links.add(full_url)
                        logging.info(f"Final link added: {full_url}")
            
        except Exception as e:
            logging.error(f"Error in deep scrape at {url}: {e}")

    def save_to_json(self, filename):
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(list(self.final_links), f, ensure_ascii=False, indent=4)
        logging.info(f"Data saved to {filename}")

# Usage
start_url = "https://www.argos.co.uk/"
sel1 = "#main-content > section.md\:ds-mx-\[calc\(50\%-49vw\)\] > div > div.M052styles__Container-sc-1cubg5c-2.leIYqs > div > div"  # First layer selector
sel2 = "#findability > div.browse > div > div.xs-12.lg-9 > div.browse__content-area > div:nth-child(4) > div"  # Second layer selector
sel3 = "#findability > div.search > div > div.styles__Container-sc-1h5mbdb-0.bCQCYQ.xs-12--none.lg-9--none.xs-stack > div:nth-child(8) > div > div"  # Third layer selector (modify according to the site structure)
max_links = 1  # Maximum number of links to scrape per layer

scraper = WebScraper(start_url, sel1, sel2, sel3, max_links)
asyncio.run(scraper.scrape())


RuntimeError: asyncio.run() cannot be called from a running event loop