In [2]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import undetected_chromedriver as uc
import time
from random import randint

## General functions

Here we define a general function to scroll pages and detect type of article.

In [29]:
class Scraper:
    def __init__(self, link: str, scrape_object, scraper_function: callable, filename: str):      
        self.link = link
        self.scrape_object = scrape_object
        self.scraper_function = scraper_function
        self.driver = None
        self.filename = filename
        

    def scroll_method(self, driver):
        last_height = 0
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            else:
                last_height = new_height


    def write_to_jsonl(self, result: list[dict] | dict, filename: str):
        """Writes the result of the scraper function to a jsonl file. 
        If result is a list, each element is written to the file. 
        If result is a dict, it is written as a single line.
        """
        if result:
            with open(f'{filename}.jsonl', 'a') as f:        
                if isinstance(result, list):
                    [f.write(json.dumps(r, ensure_ascii=False) + '\n') for r in result]
                else:
                        f.write(json.dumps(result, ensure_ascii=False) + '\n')
                        f.close()
        else:
            with open(f'{filename}_captcha.jsonl', 'a') as f:    
                    f.write(json.dumps(result, ensure_ascii=False) + '\n')
                    f.close()


    def run_scraper(self):
        """Wrapper for scraping web pages. Takes a scraper object and the function to scrape the page. 
        Writes the result of the scraper function to a jsonl file. 
        If scraper fails, likely due to captcha, the scrape_object is written to captcha.jsonl for debuging and retrials.
        """
        
        self.driver  = uc.Chrome(headless=False,use_subprocess=False)
        self.driver.get(self.link)
        self.scroll_method(self.driver)
        result =  self.scraper_function(self.driver, self.scrape_object)
        self.write_to_jsonl(result, self.filename)
        self.driver.quit()


def detect_type_article(link: str) -> str:
    # This function is used to detect the type of article
    if 'liveblog' in link:
        return 'liveblog'
    elif 'blogs.timesofisrael.com' in link:
        return 'blog'
    elif 'https://jewishchronicle.timesofisrael.com/' in link:
        return 'jewishchronicle'
    else:
        return 'article'


def collect_page_titles(driver, scrape_object) -> list[dict]:
    result = []
    h = driver.find_elements(By.XPATH, '//div[@class="headline"]/a')
    for link in h:
        scrape_object = {
            'title': link.text, 
            'link': link.get_attribute('href'),
            'type': detect_type_article(link.get_attribute('href')),
        }
        result.append(scrape_object)
    return result


def scrape_article(driver, scrape_object: dict) -> dict:
    """Scrapes title, content, date from the article page."""
    driver.get(scrape_object['link'])
    try:
        title = driver.find_element(By.XPATH, '//h1[@class="headline"]').text
        content = driver.find_element(By.XPATH, '//div[@class="the-content"]').text
        date = driver.find_element(By.XPATH, '//span[@class="date"]').text
        return {
            'title': title,
            'date': date,
            'link': scrape_object['link'],
            'type': scrape_object['type'],
            'content': content
        }
    except Exception as e:
        print(f"Error: {e}")
        return None


titles = Scraper('https://www.timesofisrael.com/', {}, collect_page_titles, 'titles')
article = Scraper('https://www.timesofisrael.com/3-soldiers-wounded-2-seriously-in-west-bank-attacks/', {'link': 'https://www.timesofisrael.com/3-soldiers-wounded-2-seriously-in-west-bank-attacks/', 'type': 'article'}, scrape_article, 'article')
article.run_scraper()

In [None]:
def detect_type_article(link: str) -> str:
    # This function is used to detect the type of article
    if 'liveblog' in link:
        return 'liveblog'
    elif 'blogs.timesofisrael.com' in link:
        return 'blog'
    elif 'https://jewishchronicle.timesofisrael.com/' in link:
        return 'jewishchronicle'
    else:
        return 'article'


### First step is to collect URLs from the home page.

In [None]:
def collect_page_titles() -> list[dict]:
    result = []
    h = driver.find_elements(By.XPATH, '//div[@class="headline"]/a')
    for link in h:
        result.append({
            'title': link.text, 
            'link': link.get_attribute('href'),
            'type': detect_type_article(link.get_attribute('href')),
        })
    return result


driver  = webdriver.Chrome()
titles_to_scrape: list[dict] = collect_page_titles()
with open('result.jsonl', 'w') as f:
    for i in titles_to_scrape:
        f.write(json.dumps(i, ensure_ascii=False) + '\n')

    f.close()
    
driver.quit()

In [None]:
def scroll_method(driver):
    """Scrolls the page to the bottom"""
    last_height = 0
    while True:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        else:
            last_height = new_height


def scrape_article(driver, scrape_object: dict) -> dict:
    """Scrapes title, content, date from the article page."""
    driver.get(scrape_object['link'])
    try:
        title = driver.find_element(By.XPATH, '//h1[@class="headline"]').text
        content = driver.find_element(By.XPATH, '//div[@class="the-content"]').text
        date = driver.find_element(By.XPATH, '//span[@class="date"]').text
        return {
            'title': title,
            'date': date,
            'link': scrape_object['link'],
            'type': scrape_object['type'],
            'content': content
        }
    except Exception as e:
        print(f"Error: {e}")
        return None


def scraper_wrapper(scrape_object: dict, scraper_function: callable):
    """Wrapper for scraping web pages. Takes a scraper object and the function to scrape the page. 
    Writes the result of the scraper function to a jsonl file. 
    If scraper fails, likely due to captcha, the scrape_object is written to captcha.jsonl for debuging and retrials.
    """
    driver  = uc.Chrome(headless=False,use_subprocess=False)
    driver.get(scrape_object['link'])
    scroll_method(driver)

    result =  scraper_function(driver, scrape_object)
    if result:
        with open('test.jsonl', 'a') as f:    
                f.write(json.dumps(result, ensure_ascii=False) + '\n')
                f.close()
    else:
        with open('captchas.jsonl', 'a') as f:    
                f.write(json.dumps(scrape_object, ensure_ascii=False) + '\n')
                f.close()

    driver.quit()


dicts = []
with open('result.jsonl', 'r') as f:
    for line in f:
        dicts.append(json.loads(line))
    f.close()

for i in dicts:
    if i['type'] == 'article':
         scraper_wrapper(i, scrape_article)
    
    time.sleep(randint(2, 5))
    print(f"Scraped {i['link']}")

Scraped https://www.timesofisrael.com/israel-claims-its-promoting-palestinian-emigration-from-gaza-so-why-are-so-few-leaving/
Scraped https://www.timesofisrael.com/liveblog-may-09-2025/
Scraped https://www.timesofisrael.com/two-idf-soldiers-killed-six-wounded-in-southern-gaza-fighting/
Scraped https://www.timesofisrael.com/israeli-plan-to-initially-only-feed-60-of-gazans-as-they-endure-extreme-deprivation/
Scraped https://www.timesofisrael.com/leading-agency-shuts-its-gaza-soup-kitchens-amid-continued-israeli-aid-ban/
Scraped https://www.timesofisrael.com/eu-to-review-trade-ties-with-israel-following-criticism-of-conduct-in-war-in-gaza/
Scraped https://www.timesofisrael.com/us-pressuring-humanitarian-groups-to-get-behind-israeli-aid-plan-for-gaza/
Scraped https://www.timesofisrael.com/israel-claims-its-promoting-palestinian-emigration-from-gaza-so-why-are-so-few-leaving/
Scraped https://www.timesofisrael.com/liveblog-may-09-2025/
Scraped https://www.timesofisrael.com/two-idf-soldiers-k

In [87]:
dicts = []
with open('test.jsonl', 'r') as f:
    for line in f:
        dicts.append(json.loads(line))
    f.close()

count = 0
for i in dicts:
    if i:
        count += 1
print(
      f'\n {time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())}'
      f'\n \t Success rate: {round(count/len(dicts)*100, 1)}%' 
      f'\n \t Articles scraped: {count}'
      f'\n \t Articles with errors: {len(dicts) - count}'
      f'\n \t Total number of articles: {len(dicts)} \n'
      )


 2025-05-11 08:39:38
 	 Success rate: 100.0%
 	 Articles scraped: 368
 	 Articles with errors: 0
 	 Total number of articles: 368 



```   author = driver.find_element(By.XPATH, '//span[@class="byline"]/a').text
```
Note to self that author can have several <a> elements if more than one author