In [1]:
# Reload edited modules automatically before each cell executes (autoreload mode 2).
%load_ext autoreload
%autoreload 2
# Load environment variables from a .env file through the dotenv IPython extension.
%load_ext dotenv
%dotenv

In [2]:
import openai
from playwright.async_api import async_playwright

from aiselectors.aiselectors import AISelectors
from aiselectors.validators import require_href, require_unique, require_unique_href

## Start the Playwright browser and open the BoardGameGeek website

In [3]:
# File that the selector cache is written to by ais.save_cache() near the end of the notebook.
# NOTE(review): no visible cell loads this file back into `ais` — confirm whether that is intended.
cache_path = '../cache.json'

In [4]:
# OpenAI client; presumably picks up its API key from the environment loaded by %dotenv — TODO confirm.
client = openai.OpenAI()
# Start the Playwright driver (top-level await is available in the notebook kernel).
playwright = await async_playwright().start()

In [5]:
# AISelectors instance built on the OpenAI client; used below to create per-page helpers
# (ais.page) and to inspect/save the prompt -> xpath cache.
ais = AISelectors(client)

In [6]:
# Launch Chromium with a visible window (headless=False) so the navigation can be watched.
browser = await playwright.chromium.launch(headless=False)

In [7]:
# Single browser page (tab) that every later cell drives.
page = await browser.new_page()

In [8]:
# Starting point of the walkthrough; `url` is reused when building the first AI page wrapper.
url = "https://boardgamegeek.com/"
# The awaited Response is the cell's displayed output.
await page.goto(url)

<Response url='https://boardgamegeek.com/' request=<Request url='https://boardgamegeek.com/' method='GET'>>

In [9]:
# Snapshot the page's current HTML as a string.
html = await page.content()
# Show the raw HTML size for comparison with the cleaned HTML in the next cell.
len(html)

354517

In [10]:
# Wrap the URL and HTML snapshot in an AISelectors page object used for the xpath lookups below.
aipage = ais.page(url, html)
# Size of the cleaned HTML (99,193 characters vs 354,517 raw in the recorded run).
len(aipage.cleaned_html)

99193

## Navigate to the list of board games

In [13]:
# Resolve the prompt 'browse menu' to an xpath, validated with require_unique;
# verbose=True prints the candidate responses (see the recorded output below).
xpath = await aipage.get_xpath('browse menu', [require_unique], verbose=True)
print(xpath)
# Click the matched element with Playwright's xpath= selector engine, then wait for the page to load.
await page.click('xpath='+xpath)
await page.wait_for_load_state()

responses [{'selector': '.dropdown-toggle.btn'}, {'selector': ".c-nav-primary .dropdown-primary .dropdown-menu-lg .dropdown-menu .dropdown-item a[href='/browse/boardgame']"}, {'selector': '.dropdown-toggle.btn'}, {'xpath': "//button[contains(text(), 'Browse')]"}, {'xpath': "//button[contains(text(), 'Browse')]"}, {'xpath': "//button[contains(text(), 'Browse')]"}, {'text': ['All Boardgames', 'Categories', 'Artists', 'Publishers', 'Honors', 'Gone Cardboard', 'Recent Additions', 'Previews', 'Families', 'Mechanics', 'Designers', 'Accessories', 'Random Game', 'Podcasts', 'Wiki']}, {'text': ['All Boardgames', 'Categories', 'Artists', 'Publishers', 'Honors', 'Gone Cardboard', 'Recent Additions', 'Previews', 'Families', 'Mechanics', 'Designers', 'Accessories', 'Random Game', 'Podcasts', 'Wiki']}, {'text': ['All Boardgames', 'Categories', 'Artists', 'Publishers', 'Honors', 'Gone Cardboard', 'Recent Additions', 'Previews', 'Families', 'Mechanics', 'Designers', 'Accessories', 'Random Game', 'Podc

In [14]:
# Resolve the prompt 'all boardgames link' to an xpath, validated with require_unique_href.
# NOTE(review): `aipage` still wraps the HTML captured before the menu was opened — confirm
# that the link is present in that snapshot (the recorded run found it).
xpath = await aipage.get_xpath('all boardgames link', [require_unique_href], verbose=True)
print(xpath)
# Follow the link and wait for the listing page to load.
await page.click('xpath='+xpath)
await page.wait_for_load_state()

responses [{'selector': "a[href='/browse/boardgame']"}, {'selector': "a[href='/browse/boardgame']"}, {'selector': "a[href='/browse/boardgame']"}, {'xpath': "//a[@href='/browse/boardgame']"}, {'xpath': "//a[@href='/browse/boardgame']"}, {'xpath': "//a[@href='/browse/boardgame']"}, {'text': '/browse/boardgame'}, {'text': '/browse/boardgame'}, {'text': '/browse/boardgame'}]
selector descendant-or-self::a[@href = '/browse/boardgame']
selector descendant-or-self::a[@href = '/browse/boardgame']
selector descendant-or-self::a[@href = '/browse/boardgame']
xpath //a[@href='/browse/boardgame']
xpath //a[@href='/browse/boardgame']
xpath //a[@href='/browse/boardgame']
text /browse/boardgame
text /browse/boardgame
text /browse/boardgame
valid: //a[@href='/browse/boardgame']
valid: descendant-or-self::a[@href = '/browse/boardgame']
xpath_text_matches Counter({"//a[@href='/browse/boardgame']": 1, "descendant-or-self::a[@href = '/browse/boardgame']": 1})
//a[@href='/browse/boardgame']


## Get the links to individual board games

In [15]:
# Re-snapshot after navigation: the previous `html`/`aipage` described the home page.
html = await page.content()
aipage = ais.page(page.url, html)
print(page.url, len(html), len(aipage.cleaned_html))
# Resolve an xpath for the game-title links, validated with require_href (no uniqueness
# validator here); the result is reused by the cells below.
xpath = await aipage.get_xpath('links to game titles in the board game ranking list', [require_href], verbose=True)
print(xpath)

https://boardgamegeek.com/browse/boardgame
JSONDecodeError {
  "text": [
    "Brass: Birmingham",
    "Pandemic Legacy: Season 1",
    "Gloomhaven",
    "Ark Nova",
    "Twilight Imperium: Fourth Edition",
    "Dune: Imperium",
    "Terraforming Mars",
    "War of the Ring: Second Edition",
    "Gloomhaven: Jaws of the Lion",
    "Star Wars: Rebellion",
    "Spirit Island",
    "Gaia Project",
    "Twilight Struggle",
    "Through the Ages: A New Story of Civilization",
    "Great Western Trail",
    "The Castles of Burgundy",
    "Scythe",
    "Eclipse: Second Dawn for the Galaxy",
    "7 Wonders Duel",
    "Brass: Lancashire",
    "Nemesis",
    "Clank! Legacy: Acquisitions Incorporated",
    "Concordia",
    "A Feast for Odin",
    "Wingspan",
    "Arkham Horror: The Card Game",
    "Lost Ruins of Arnak",
    "Terra Mystica",
    "Great Western Trail: Second Edition",
    "Root",
    "Frosthaven",
    "Orléans",
    "Too Many Bones",
    "Mage Knight Board Game",
    "The Crew: Miss

## Navigate to each page and get the title and average rating

In [17]:
games_count = len(await page.locator("xpath="+xpath).element_handles())
print(games_count)

100


In [19]:
for i in range(games_count):
    if i > 5:
        break
    elms = await page.locator("xpath="+xpath).element_handles()
    await elms[i].click()
    await page.wait_for_load_state()
    html = await page.content()
    game_page = ais.page(page.url, html)
    title_xpath = await game_page.get_xpath('game title', [require_unique], verbose=False)
    print('title_xpath', title_xpath)
    title = await page.locator('xpath='+title_xpath).text_content()
    print('title', title)
    rating_xpath = await game_page.get_xpath('average rating', [require_unique], verbose=False)
    print('rating_xpath', rating_xpath)
    rating = await page.locator('xpath='+rating_xpath).text_content()
    print('rating', rating)
    await page.go_back()
    # await page.wait_for_load_state()

title_xpath //div[@class='game-header-title-info']/h1/a/span
title Brass: Birmingham
rating_xpath /html/body/div[2]/main/div[2]/div/div[1]/div[2]/ng-include/div/ng-include/div/div[2]/div[2]/div[1]/div/div[1]/overall-rating/div/div/a/span[1]
rating  					8.6 				
title_xpath //div[@class='game-header-title-info']/h1/a/span
title Pandemic Legacy: Season 1
rating_xpath /html/body/div[2]/main/div[2]/div/div[1]/div[2]/ng-include/div/ng-include/div/div[2]/div[2]/div[1]/div/div[1]/overall-rating/div/div/a/span[1]
rating  					8.5 				
title_xpath //div[@class='game-header-title-info']/h1/a/span
title Gloomhaven
rating_xpath /html/body/div[2]/main/div[2]/div/div[1]/div[2]/ng-include/div/ng-include/div/div[2]/div[2]/div[1]/div/div[1]/overall-rating/div/div/a/span[1]
rating  					8.6 				
title_xpath //div[@class='game-header-title-info']/h1/a/span
title Ark Nova
rating_xpath /html/body/div[2]/main/div[2]/div/div[1]/div[2]/ng-include/div/ng-include/div/div[2]/div[2]/div[1]/div/div[1]/overall-rat

## Show the cache

The cache maps each prompt, keyed as `domain|prompt`, to the xpath that was found for it.
It can be saved and re-loaded later to avoid re-evaluating the prompts in future runs.
If a cached xpath no longer matches anything in a future run, the prompt is re-evaluated and the cache is updated with the new xpath.

In [20]:
# Display the in-memory prompt -> xpath cache as the cell output.
# NOTE(review): `_cache` is a private attribute; prefer a public accessor if the library offers one.
ais._cache

{'boardgamegeek.com|browse menu': "//button[contains(text(), 'Browse')]",
 'boardgamegeek.com|all boardgames link': "//a[@href='/browse/boardgame']",
 'boardgamegeek.com|links to game titles in the board game ranking list': '/html/body/div[2]/main/div[2]/div/div[1]/div/div/div[2]/div[3]/table/tbody/tr/td[3]/div[2]/a',
 'boardgamegeek.com|game title': "//div[@class='game-header-title-info']/h1/a/span",
 'boardgamegeek.com|average rating': '/html/body/div[2]/main/div[2]/div/div[1]/div[2]/ng-include/div/ng-include/div/div[2]/div[2]/div[1]/div/div[1]/overall-rating/div/div/a/span[1]'}

In [21]:
# Persist the cache to the path configured near the top of the notebook.
ais.save_cache(cache_path)

## Clean up

In [98]:
# Close the browser first, then stop the Playwright driver that was started at the top.
await browser.close()
await playwright.stop()

## misc test cells

In [None]:
from lxml import html as lxml_html
from lxml import etree

# Sanity-check the ranking-list xpath directly with lxml, independent of Playwright.
# NOTE: this overwrites the notebook-level `xpath` and `elms` variables.
xpath = "/html/body/div[2]/main/div[2]/div/div[1]/div/div/div[2]/div[3]/table/tbody/tr/td[3]/div[2]/a"
tree = lxml_html.fromstring(aipage.html)
elms = tree.xpath(xpath)
print(len(elms))

# Preview the first 11 matches (the previous loop stopped after printing index 10).
for elm in elms[:11]:
    # encoding='unicode' makes tostring() return str, so the markup prints readably
    # instead of as a bytes repr with escaped non-ASCII characters.
    print(etree.tostring(elm, encoding='unicode'))
    print('\n')
