#### Web Parser (1 step)

Some configuration.

In [1]:
import yaml, os

with open("config/config.yaml", 'r') as stream:
    config = yaml.safe_load(stream)

os.environ["OPENAI_API_KEY"] = config['OPENAI_API_KEY']
USER_DATA_DIR = config['USER_DATA_DIR']
MAIN_URL = config['MAIN_URL']
MODEL_NAME = config['MODEL_NAME']
TEMPERATURE = config['TEMPERATURE']
WEBSITE_USERNAME = config['WEBSITE_USERNAME']
WEBSITE_PASSWORD = config['WEBSITE_PASSWORD']

Start webdriver with main url. Then finish the login process automatically.

In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

options = webdriver.ChromeOptions()
if USER_DATA_DIR:
    options.add_argument(f"user-data-dir={USER_DATA_DIR}")
    # options.add_argument("--headless")

driver = webdriver.Chrome(chrome_options=options)
driver.maximize_window()
driver.get(MAIN_URL)
driver.implicitly_wait(2)

driver.find_element(By.CSS_SELECTOR, 'a.nav-link.login.w-nav-link').click()
driver.find_element(By.CSS_SELECTOR, "input[name='email']").clear()
driver.find_element(By.CSS_SELECTOR, "input[name='email']").send_keys(WEBSITE_USERNAME)
driver.find_element(By.CSS_SELECTOR, "input[name='password']").send_keys(WEBSITE_PASSWORD)
driver.find_element(By.CSS_SELECTOR, 'button.btn.btn-success').click()

  driver = webdriver.Chrome(chrome_options=options)


A html cleaning function. Used for every step.

In [14]:
from bs4 import BeautifulSoup

def html_cleaner(html_content):
    soup = BeautifulSoup(html_content, 'html.parser')
    elements_with_style = soup.select('[style]')
    for element in elements_with_style:
        del element['style']
    
    for tag in soup(['script', 'style', 'noscript', 'circle', 'meta', 'path']):
        tag.extract()

    # for div in soup.find_all('div'):
    #     del div['tabindex', 'data-test', 'aria-hidden', 'role']

    clean_html_content = soup.prettify()
    with open('data/html_content.html', 'w') as f:
        f.write(str(clean_html_content))
        
    return clean_html_content

Token counting function. Used for segmentation.

In [15]:
import tiktoken

def num_tokens_from_string(string: str, model_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.encoding_for_model(model_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

Using LLMChain with this prompt. Notice that this prompt still needs more refinement, and we add some instructions for instantly web HTML specifically.

In [16]:
WEBPAGE_TEMPLATE = """Given the following HTML script, you need to extract from it some elements of the webpage that can be interacted with.
An interactive web element is a button or a text input box, and you need to make full use of your knowledge of html to determine this. 

If there's a <title> or <h3> <h4> <h5> <h6> label, you must add that to element list.
If there's a <li> label with 'sidebar_wrapper_XXX' as its id, you must add that to element list.

Your output format should be [{{"element_name": "...", "element_description": "...", "element_css": "..."}}, {{"element_name": "...", "element_description": "...", "element_css": "..."}}, ...]
Element name should preferably be some text attribute, if not exist, a reasonable name should be inferred from other attributes like class / id / html structure. 
Element description should describe what the element does and where it is located, or the function of clicking / typing on it.
Element css should be its css path used for locating elements using selenium.

HTML script is:
{html_snippet}

Your output:
"""

In [17]:
from langchain.prompts import PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.chains import LLMChain

llm = ChatOpenAI(model_name=MODEL_NAME, temperature=TEMPERATURE)
prompt = PromptTemplate(input_variables=["html_snippet"], template=WEBPAGE_TEMPLATE)
chain = LLMChain(llm=llm, prompt=prompt)

HTML into chunks. Before running this, you can go to any page you want manually. Then get the element tool list recursively. This will take some time for every loop.

`TODO`: Using langchain text splitter and its overlap setting.

In [22]:
from tqdm import tqdm

def html_parser(all_node, driver, chain):
    html = html_cleaner(driver.page_source) # using html_cleaner
    chunk_num = int(num_tokens_from_string(str(html), model_name="gpt-4") / 4000)
    html_lines = html.split('\n')
    chunks = [html_lines[i:i+int(len(html_lines) / chunk_num)] for i in range(0, len(html_lines), int(len(html_lines) / chunk_num))]
    chunks = ['\n'.join(chunk) for chunk in chunks]

    for i in tqdm(range(len(chunks))):
        try:
            new_node_list = eval(chain.run(chunks[i]))
            
            for new_node in new_node_list:
                new_node["child_element"] = ""
                new_node["element_url"] = driver.current_url
                if new_node not in all_node:
                    all_node.append(new_node)
            
        except Exception as e:
            print(e)
            continue

    return all_node

Manually traverse the seven buttons in the left column as nodes in turn, and then build the whole tree manually in different csv.

In [8]:
sidebar_nodes = ["accounts", "campaigns", "analytics", "unibox", "settings", "accelerator"] # all sidebar elements

In [23]:
import pandas as pd
import time

for node in sidebar_nodes:
    driver.find_element(By.CSS_SELECTOR, f'li#sidebar_wrapper_{node}').click()
    time.sleep(1)

    all_node = []
    all_node = html_parser(all_node, driver=driver, chain=chain)

    df = pd.DataFrame(all_node)
    df.to_excel(f"data/{node}_KG.xlsx", index=False)

100%|██████████| 4/4 [02:44<00:00, 41.19s/it]

invalid syntax (<string>, line 1)



100%|██████████| 3/3 [02:39<00:00, 53.18s/it]
100%|██████████| 1/1 [01:22<00:00, 82.98s/it]


`TODO`: This part is not finished, don't play with it.

In [None]:
driver.execute_script("window.scrollBy(0,1000)")
last_height = driver.execute_script("return document.body.scrollHeight")

# whether we're at the bottom of page
while True:
    driver.execute_script("window.scrollBy(0, 1000)")
    new_height = driver.execute_script("return document.body.scrollHeight")
    if new_height == last_height:
        break
    last_height = new_height

driver.back() # get back to last page

In [None]:
driver.close()