<a href="https://colab.research.google.com/github/erlichsefi/ScrapeAnything/blob/main/browser_base_translation%20/%20JavaScript%20Converting%20Robust%20Test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Converting sites to dataframe

# first: install selenium & chromium

In [1]:
%%capture
%%shell
# According to: https://stackoverflow.com/questions/51046454/how-can-we-use-selenium-webdriver-in-colab-research-google-com
#
# The two cell magics above must stay on the first lines of the cell: IPython
# only recognises a cell magic when the cell starts with it, so no comment may
# precede them.
#
# Ubuntu no longer distributes chromium-browser outside of snap
#
# Proposed solution: https://askubuntu.com/questions/1204571/how-to-install-chromium-without-snap

# Add debian buster
cat > /etc/apt/sources.list.d/debian.list <<'EOF'
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster.gpg] http://deb.debian.org/debian buster main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-buster-updates.gpg] http://deb.debian.org/debian buster-updates main
deb [arch=amd64 signed-by=/usr/share/keyrings/debian-security-buster.gpg] http://deb.debian.org/debian-security buster/updates main
EOF

# Add keys
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys DCC9EFBF77E11517
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 648ACFD622F3D138
apt-key adv --keyserver keyserver.ubuntu.com --recv-keys 112695A0E562B32A

# Export each key into its own keyring so the signed-by= entries above resolve
apt-key export 77E11517 | gpg --dearmour -o /usr/share/keyrings/debian-buster.gpg
apt-key export 22F3D138 | gpg --dearmour -o /usr/share/keyrings/debian-buster-updates.gpg
apt-key export E562B32A | gpg --dearmour -o /usr/share/keyrings/debian-security-buster.gpg

# Prefer debian repo for chromium* packages only
# Note the double-blank lines between entries
cat > /etc/apt/preferences.d/chromium.pref << 'EOF'
Package: *
Pin: release a=eoan
Pin-Priority: 500


Package: *
Pin: origin "deb.debian.org"
Pin-Priority: 300


Package: chromium*
Pin: origin "deb.debian.org"
Pin-Priority: 700
EOF

# Install chromium and chromium-driver
# -y answers the confirmation prompt so the cell can run unattended
# (its output is hidden by %%capture, so a waiting prompt would not be visible).
apt-get update
apt-get install -y chromium chromium-driver

# Install selenium
pip install selenium
# apt-get is used instead of apt: apt warns that its CLI is not stable for scripts.
apt-get install -y chromium-chromedriver
pip install pandas

In [41]:
# JavaScript executed inside the page. It prints, through console.log, one CSV
# header line followed by one CSV row per retained element; the caller captures
# those console.log messages and parses them as CSV.
script_with_logs = """
// Get all elements in the HTML page
const elements = document.getElementsByTagName('*');

// Turn any value into a single CSV cell: commas become ';' because ',' is the
// column separator, and line breaks collapse to one space because rows are
// separated by newlines. null/undefined become ''.
const toCell = (value) => String(value == null ? '' : value)
  .replaceAll(",",";")
  .replace(/\\s*[\\r\\n]+\\s*/g, " ")
  .trim();

// Read an HTML attribute, falling back to '' when it is absent.
const attr = (element, name) => element.hasAttribute(name) ? element.getAttribute(name) : '';

// Create an array to store the element details
const elementDetails = [];

// Iterate through each element
for (let i = 0; i < elements.length; i++) {
  const element = elements[i];

  // Get the bounding rectangle of the element
  const rect = element.getBoundingClientRect();

  // Store the element, its bounding rectangle and its textual descriptions
  const elementInfo = {
    element: element,
    rect: rect,
    // textContent and nodeName are DOM properties, not HTML attributes, so they
    // are read directly: hasAttribute()/getAttribute() never finds them.
    textContent: toCell(element.textContent),
    ariaLabel: toCell(attr(element, 'aria-label')),
    tooltip: toCell(attr(element, 'title')),
    e_type: toCell(element.nodeName),
    data_initial_value: toCell(attr(element, 'data-initial-value')),
    // innerText can be undefined; toCell maps that to ''.
    innerText: toCell(element.innerText),
  };

  // Keep only elements that have a rendered box and at least one description
  if ( elementInfo.rect.width > 0 && elementInfo.rect.height > 0){
    if (elementInfo.innerText != '' || elementInfo.data_initial_value != '' || elementInfo.tooltip != '' || elementInfo.textContent != '' || elementInfo.ariaLabel != ''){
      elementDetails.push(elementInfo);
    }

  }
}

// Count, per parent element, how many retained elements it directly contains.
// A Map keeps element identity; a plain object would coerce every key to a
// string such as "[object HTMLDivElement]" and merge unrelated parents.
const childCounts = new Map();
for (const detail of elementDetails) {
  const parent = detail.element.parentElement;
  childCounts.set(parent, (childCounts.get(parent) || 0) + 1);
}

// Drop an element when it is the parent of exactly one retained element
const withoutParents = elementDetails.filter(detail => childCounts.get(detail.element) !== 1);

// Display the element details
console.log("X,Y,Top,Bottom,Left,Right,Width,Height,ElementType,textContent,TooltipValue,AriaLabel,data-initial-value");
console.log(withoutParents.map( e=> e.rect.x+","+e.rect.y+","+e.rect.top+","+e.rect.bottom+","+e.rect.left+","+e.rect.right+","+e.rect.width+","+e.rect.height+","+e.e_type+","+e.textContent+","+e.tooltip+","+e.ariaLabel+","+e.data_initial_value).join("\\n"));
"""

In [42]:
def write_to_text(filename,logs,error):
  """Write a failure report to ``filename``.

  The first line is ``error=<error>``; the log entries follow, one per line.

  Args:
    filename: path of the text file to create or overwrite.
    logs: iterable of log entries. Each entry is converted with ``str`` so
      non-string console messages do not break the join; ``None`` is treated
      as "no entries".
    error: the exception (or message) to record on the first line.
  """
  entries = [] if logs is None else logs
  # UTF-8 is set explicitly so non-ASCII page text is written the same way
  # regardless of the platform's default encoding.
  with open(filename, 'w', encoding='utf-8') as f:
    f.write(f"error={error}\n")
    f.write('\n'.join(str(entry) for entry in entries))


def draw_all_rec_on_screenshot(file_name,list_of_elements):
  """Outline every listed element in red on a copy of a screenshot.

  Args:
    file_name: path of the screenshot image to read.
    list_of_elements: DataFrame whose rows provide the 'Top', 'Bottom',
      'Left' and 'Right' columns used as rectangle edges.

  The annotated image is saved in the same directory as ``file_name`` under
  the name ``boxes_<original file name>``; for a bare file name this is
  ``boxes_<file_name>`` in the current directory.
  """
  import os
  from PIL import Image, ImageDraw

  # Prefix only the file name, so a path such as "out/page.png" becomes
  # "out/boxes_page.png" rather than "boxes_out/page.png".
  directory, base_name = os.path.split(file_name)
  output_path = os.path.join(directory, f"boxes_{base_name}")

  # The context manager releases the image file even if drawing or saving fails.
  with Image.open(file_name) as screenshot:
    draw = ImageDraw.Draw(screenshot)

    for _,element in list_of_elements.iterrows():
      top_left = (element['Left'], element['Top'])
      bottom_right = (element['Right'], element['Bottom'])
      draw.rectangle([top_left, bottom_right], outline='red')

    # Save once, after all rectangles have been drawn.
    screenshot.save(output_path)

In [43]:
def web_driver_to_image(wd,prefix=""):
  """Save a screenshot of the driver's current page and return the file stem.

  The stem is ``prefix`` followed by the current URL reduced to scheme, host
  and path, with every character other than letters, digits, '_' and '-'
  replaced by '_'. The screenshot is written to ``<stem>.png``.

  Args:
    wd: driver object exposing ``current_url`` and ``save_screenshot``.
    prefix: text placed in front of the generated stem.

  Returns:
    The file stem, without the ``.png`` extension.
  """
  import re
  from urllib.parse import urlparse, urlunparse

  # Keep scheme, host and path; params, query string and fragment are blanked.
  scheme, netloc, path = urlparse(wd.current_url)[:3]
  bare_url = urlunparse((scheme, netloc, path, "", "", ""))

  sanitized = re.sub(r'[^a-zA-Z0-9_-]', '_', bare_url)
  stem = f"{prefix}{sanitized}"

  wd.save_screenshot(f"{stem}.png")
  return stem

def get_browser(driver_path=r'/usr/bin/chromedriver'):
    """Start a new headless Chrome session and return its WebDriver.

    Args:
        driver_path: location of the chromedriver executable. Defaults to the
            path used by this notebook's install cell.

    Returns:
        A ``selenium.webdriver.Chrome`` instance; the caller is responsible
        for closing it.
    """
    from selenium import webdriver
    from selenium.webdriver.chrome.service import Service

    service = Service(executable_path=driver_path)
    chrome_options = webdriver.ChromeOptions()
    # '--headless=new' is the only headless switch needed. The former
    # `chrome_options.headless = True` assignment was removed: that setter is
    # deprecated in Selenium and redundant with this argument (it appears to be
    # the source of the stray warning line in this notebook's recorded output).
    chrome_options.add_argument('--headless=new')
    chrome_options.add_argument('--no-sandbox')

    return webdriver.Chrome(service=service,options=chrome_options)

def screen_to_table(wd,url,raw=False):
  """Load ``url``, screenshot it and extract the element table from the page.

  Args:
    wd: Selenium WebDriver used to load the page and run the script.
    url: address of the page to capture.
    raw: when True, skip CSV parsing and return the captured console lines.

  Returns:
    ``(file_name, df)`` where ``file_name`` is the screenshot stem (without
    ``.png``) and ``df`` is a DataFrame parsed from the captured CSV lines,
    or the raw list of console messages when ``raw`` is True.

  Raises:
    Whatever ``pandas.read_csv`` raises when the captured lines are not valid
    CSV (only when ``raw`` is False), plus any driver error raised while
    taking the screenshot or executing the script.
  """
  import io
  import warnings

  import pandas as pd

  # Try with growing timeouts of 10, 30 and 50 seconds.
  last_error = None
  for timeout in range(10,60,20):
    try:
      wd.set_page_load_timeout(timeout)
      wd.implicitly_wait(timeout)
      wd.get(url)
      last_error = None
      break

    except Exception as error:
      # Best effort: remember the failure and retry with a longer timeout.
      last_error = error

  if last_error is not None:
    # Keep going with whatever the browser currently shows, but say so instead
    # of failing silently.
    warnings.warn(f"Loading {url} did not complete: {last_error!r}")

  file_name = web_driver_to_image(wd)

  # console.log is patched only while the extraction script runs and is restored
  # afterwards, so the page is not left with a wrapped console.log.
  script = f"""
  var consoleLogs = [];
  var originalLog = console.log;
  console.log = function(message) {{
      consoleLogs.push(message);
      originalLog.apply(console, arguments);
  }};

  try {{
  {script_with_logs}
  }} finally {{
      console.log = originalLog;
  }}

  return consoleLogs;
  """

  logs = wd.execute_script(script)

  if not raw:
    # str() guards against console messages that are not strings.
    csv_text = "\n".join(str(entry) for entry in logs)
    df = pd.read_csv(io.StringIO(csv_text), sep=",")
    draw_all_rec_on_screenshot(f"{file_name}.png",df)
  else:
    df = logs
  return file_name,df

In [44]:
# wd = get_browser()
# df = screen_to_table(wd,"https://www.cnn.com")

  chrome_options.headless = True


In [45]:
# URLs of the news sites that the capture loop visits, in this order.
news_sites = [
    "https://www.n12.co.il/",
    "https://www.cnn.com",
    "https://www.bbc.co.uk/news",
    "https://www.nytimes.com",
    "https://www.aljazeera.com",
    "https://www.reuters.com",
    "https://www.theguardian.com",
    "https://www.foxnews.com",
    "https://www.nbcnews.com",
    "https://www.bloomberg.com",
    "https://apnews.com",
    "https://www.huffpost.com",
    "https://www.usatoday.com",
    "https://www.washingtonpost.com",
    "https://www.wsj.com",
    "https://abcnews.go.com",
    "https://www.buzzfeednews.com",
    "https://www.cbsnews.com",
    "https://www.npr.org",
    "https://www.ft.com",
    "https://www.independent.co.uk",
]

In [47]:
# Capture every site in news_sites with one shared browser session. For each
# site the parsed table is written to <stem>.csv; when parsing fails, the raw
# console lines and the error are written to <stem>.txt instead.
wd = get_browser()

try:
  for url in news_sites:
    print(f" ---- URL = {url} ---- ")

    try:
      file_name,df = screen_to_table(wd,url)
      csv_file_name = f"{file_name}.csv"
      screenshot_file_name = f"{file_name}.png"
      df.to_csv(csv_file_name)

      print(f"screenshot_file_name = {screenshot_file_name}.")
      print(f"Parsing was sucsessful, csv_file_name = {csv_file_name}.")
    except Exception as e:
      try:
        file_name,df = screen_to_table(wd,url,raw=True)
      except Exception as raw_error:
        # The raw fallback failed too: report it and move on to the next site
        # rather than aborting the whole run.
        print(f"Could not capture {url}: parse error = {e!r}, raw capture error = {raw_error!r}.")
        continue

      txt_file_name = f"{file_name}.txt"
      # Defined here as well, so this branch never reports a missing or stale
      # name left over from a previous iteration.
      screenshot_file_name = f"{file_name}.png"
      write_to_text(txt_file_name,df,e)

      print(f"screenshot_file_name = {screenshot_file_name}.")
      print(f"Parsing was un-sucsessful, txt_file_name = {txt_file_name}.")

    print(f"please make sure all text in screenshot is explained by output file")
finally:
  # Always release the browser, including when the run is interrupted.
  wd.quit()

  chrome_options.headless = True


 ---- URL = https://www.n12.co.il/ ---- 
screenshot_file_name = https___www_n12_co_il_.png.
Parsing was sucsessful, csv_file_name = https___www_n12_co_il_.csv.
please make sure all text in screenshot is explained by output file
 ---- URL = https://www.cnn.com ---- 
screenshot_file_name = https___www_cnn_com_.png.
Parsing was sucsessful, csv_file_name = https___www_cnn_com_.csv.
please make sure all text in screenshot is explained by output file
 ---- URL = https://www.bbc.co.uk/news ---- 
screenshot_file_name = https___www_bbc_com_news.png.
Parsing was sucsessful, csv_file_name = https___www_bbc_com_news.csv.
please make sure all text in screenshot is explained by output file
 ---- URL = https://www.nytimes.com ---- 
screenshot_file_name = https___www_nytimes_com_.png.
Parsing was sucsessful, csv_file_name = https___www_nytimes_com_.csv.
please make sure all text in screenshot is explained by output file
 ---- URL = https://www.aljazeera.com ---- 
screenshot_file_name = https___www_alj

KeyboardInterrupt: ignored

# Here are some issues found in manual analysis

In [None]:
def describe(site):
  """Display the screenshot stored for ``site`` and return its CSV table.

  Args:
    site: file stem shared by ``<site>.png`` and ``<site>.csv``.

  Returns:
    The DataFrame read from ``<site>.csv``.
  """
  import pandas as pd
  from IPython.display import Image, display

  screenshot = Image(filename=f"{site}.png")
  display(screenshot)

  table = pd.read_csv(f"{site}.csv")
  return table

In [None]:
# Inspect the Al Jazeera capture.
# NOTE(review): the recorded run of the capture loop was interrupted
# (KeyboardInterrupt) while reporting this site, so confirm that both the
# .png and .csv files exist before running this cell.
describe("https___www_aljazeera_com_")

Issues to address:
 - Where is 'all bets are off'?
 - Can we add a 'z' axis? Or, if there is an overlay, describe only the topmost one?


In [None]:
# Inspect the Financial Times capture.
# NOTE(review): https://www.ft.com comes after the point where the recorded
# run was interrupted, so these files presumably come from another run — confirm.
describe("https___www_ft_com_")

Issues:
  - Where is the 'Accept cookies' / 'manage cookies'?
  - Where is the 'sign in' / 'subscribe'?

In [None]:
# Inspect the New York Times capture (the recorded run reports this site as parsed).
describe("https___www_nytimes_com_")

Issues:
  - Where is the text on the left of the screen?