In [16]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import undetected_chromedriver as uc
import time
from random import randint
import dataclasses as dc

## General functions

Here we define general-purpose helpers for scrolling a page to the bottom and for detecting the type of an article from its URL.

In [17]:
@dc.dataclass
class WebPage:
    website: str = None
    url: str = None
    link: str = None
    title: str = None
    media_type: str = None
    date: str=None
    content: str = None



class Scraper:
        
    def scroll_method(self, driver):
        last_height = 0
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            else:
                last_height = new_height

    def write_to_jsonl(self, result: list[WebPage] | WebPage | Exception, filename: str):
        """Writes the result of the scraper function to a jsonl file. 
        If result is a list, each element is written to the file. 
        If result is a dict, it is written as a single line.
        """
        if result:
            with open(f'{filename}.jsonl', 'a') as f:        
                if isinstance(result, list):
                    result = [dc.asdict(r) for r in result]
                    [f.write(json.dumps(r, ensure_ascii=False) + '\n') for r in result]

                elif isinstance(result, dict):
                    f.write(json.dumps(result, ensure_ascii=False) + '\n')
                    f.close()
        
                elif isinstance(result, Exception):
                    with open(f'{filename}_error.jsonl', 'a') as f:
                        f.write(json.dumps({'error': str(result)}, ensure_ascii=False) + '\n')
                        f.close()

        else:
            with open(f'{filename}_captcha.jsonl', 'a') as f:    
                    f.write(json.dumps(result, ensure_ascii=False) + '\n')
                    f.close()

    def run(self, scrape_object: WebPage, scraper_function, filename: str):
        # This function is used to run the scraper
        self.driver  = uc.Chrome(headless=False,use_subprocess=False)
        self.driver.get(scrape_object.link)
        self.scroll_method(self.driver)
        result =  scraper_function(self.driver, scrape_object)
        self.write_to_jsonl(result, filename)
        self.driver.quit()


class Database:
    """Lookup helper for the jsonl file holding the links collected so far."""

    @staticmethod
    def check_if_exists(link: str, path: str = 'israeli_times_links.jsonl') -> bool:
        """Return True when a record with this ``link`` is already stored in ``path``.

        Args:
            link: URL to look for.
            path: jsonl file to search, one JSON object per line.

        Returns:
            True if some line is a JSON object whose ``'link'`` equals ``link``.
            False otherwise, including when the file does not exist yet (first
            run). Blank lines and records without a ``'link'`` key are skipped.
        """
        try:
            with open(path, 'r') as f:
                for line in f:
                    if not line.strip():
                        continue
                    record = json.loads(line)
                    if isinstance(record, dict) and record.get('link') == link:
                        return True
        except FileNotFoundError:
            # Nothing has been collected yet, so the link cannot exist.
            return False
        return False


class IsraeliTimesScraper():
    """Times of Israel scraper built on top of the generic ``Scraper`` harness."""

    def __init__(self):
        self.scraper = Scraper()

    def detect_type_article(self, link: str) -> str:
        """Classify a URL by the first marker substring it contains.

        Returns 'liveblog', 'blog', 'jewishchronicle' or, when no marker
        matches, 'article'.
        """
        if 'liveblog' in link:
            return 'liveblog'
        elif 'blogs.timesofisrael.com' in link:
            return 'blog'
        elif 'https://jewishchronicle.timesofisrael.com/' in link:
            return 'jewishchronicle'
        else:
            return 'article'

    def collect_page_titles(self, driver, _: "WebPage") -> "list[WebPage] | Exception":
        """Collect the headline links of the page currently loaded in ``driver``.

        Anchors without an ``href``, links that occur more than once on the page
        and links already present in the database are skipped.

        Args:
            driver: WebDriver showing the page to inspect.
            _: Unused; present so the method matches the
                ``(driver, scrape_object)`` scraper-function signature.

        Returns:
            One ``WebPage`` per new link, or an ``Exception`` instance (returned,
            not raised) when the page holds no new link.
        """
        result = []
        seen_links = set()
        already_scraped_count = 0
        for anchor in driver.find_elements(By.XPATH, '//div[@class="headline"]/a'):
            href = anchor.get_attribute('href')
            # An anchor without href cannot be scraped, and a link repeated on
            # the page must be queued only once.
            if not href or href in seen_links:
                continue
            seen_links.add(href)

            if Database.check_if_exists(href):
                already_scraped_count += 1
                continue

            result.append(
                WebPage(
                    website='timesofisrael',
                    url=driver.current_url,
                    date=None,
                    title=anchor.text,
                    link=href,
                    media_type=self.detect_type_article(href),
                    content=None,
                )
            )

        print("Already scraped:", already_scraped_count)
        if len(result) == 0:
            return Exception('Already scraped all articles on this page')

        print('Collecting:', len(result), 'articles from the page')

        return result

    @staticmethod
    def scrape_article(driver, scrape_object: "WebPage") -> "WebPage | None":
        """Scrape title, content and date from the article page loaded in ``driver``.

        Args:
            driver: WebDriver already showing the article.
            scrape_object: Record of the article being scraped.

        Returns:
            A copy of ``scrape_object`` with ``title``, ``date`` and ``content``
            filled in, or ``None`` when scraping failed (the error is printed).
        """
        try:
            title = driver.find_element(By.XPATH, '//h1[@class="headline"]').text
            content = driver.find_element(By.XPATH, '//div[@class="the-content"]').text
            date = driver.find_element(By.XPATH, '//span[@class="date"]').text
            # Copy the incoming record so website, url, link and media_type are kept.
            return dc.replace(scrape_object, title=title, date=date, content=content)
        except Exception as e:
            print(f"Error: {e}")
            return None

    def run(self):
        """Collect the headline links of the home page into 'israeli_times_links'."""
        homepage = WebPage(link='https://www.timesofisrael.com/', media_type='homepage')
        self.scraper.run(homepage, self.collect_page_titles, 'israeli_times_links')


# Entry point of this cell: open the Times of Israel home page and hand the
# headline links that are not yet stored to the 'israeli_times_links' output.
IsraeliTimesScraper().run()




Already scraped: 0
Collecting: 457 articles from the page


In [18]:
def detect_type_article(link: str) -> str:
    """Return the article type implied by ``link``.

    The URL is checked against a fixed list of marker substrings; the first
    marker found decides the type, and 'article' is the fallback.
    """
    # Order matters: an earlier marker wins when several of them match.
    markers = (
        ('liveblog', 'liveblog'),
        ('blogs.timesofisrael.com', 'blog'),
        ('https://jewishchronicle.timesofisrael.com/', 'jewishchronicle'),
    )
    for marker, article_type in markers:
        if marker in link:
            return article_type
    return 'article'


### The first step is to collect URLs from the home page.

In [19]:
def collect_page_titles() -> list[dict]:
    """Collect the headline links of the page loaded in the module-level ``driver``.

    Returns:
        One dict per headline anchor with the keys 'title' (anchor text),
        'link' (its href) and 'type' (result of ``detect_type_article``).
        Anchors without an href are skipped.
    """
    result = []
    for anchor in driver.find_elements(By.XPATH, '//div[@class="headline"]/a'):
        # Read the href once: every get_attribute call is a WebDriver request.
        href = anchor.get_attribute('href')
        if not href:
            # detect_type_article would fail on None, and there is nothing to scrape.
            continue
        result.append({
            'title': anchor.text,
            'link': href,
            'type': detect_type_article(href),
        })
    return result


driver = webdriver.Chrome()
try:
    # The browser starts on a blank tab, so load the home page before
    # collecting; otherwise no headline is found and result.jsonl stays empty.
    driver.get('https://www.timesofisrael.com/')
    titles_to_scrape: list[dict] = collect_page_titles()
finally:
    # Close the browser even when loading or collecting failed.
    driver.quit()

# One JSON object per line; the file is closed by the with-block.
with open('result.jsonl', 'w') as f:
    for entry in titles_to_scrape:
        f.write(json.dumps(entry, ensure_ascii=False) + '\n')

In [20]:
def scroll_method(driver):
    """Keep scrolling to the bottom until the page height no longer changes.

    After each scroll the function waits one second and then compares the
    document height with the height seen after the previous scroll.
    """
    previous_height = 0
    settled = False
    while not settled:
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(1)
        current_height = driver.execute_script("return document.body.scrollHeight")
        # Two equal readings in a row mean nothing more was loaded.
        settled = current_height == previous_height
        previous_height = current_height


def scrape_article(driver, scrape_object: dict) -> dict:
    """Open the article at ``scrape_object['link']`` and extract its text fields.

    Returns a dict with 'title', 'date', 'link', 'type' and 'content', or
    ``None`` when anything inside the extraction step raises (the error is
    printed).
    """
    driver.get(scrape_object['link'])
    # Looked up in this order: title, content, date.
    selectors = (
        ('title', '//h1[@class="headline"]'),
        ('content', '//div[@class="the-content"]'),
        ('date', '//span[@class="date"]'),
    )
    try:
        scraped = {
            field: driver.find_element(By.XPATH, xpath).text
            for field, xpath in selectors
        }
        return {
            'title': scraped['title'],
            'date': scraped['date'],
            'link': scrape_object['link'],
            'type': scrape_object['type'],
            'content': scraped['content'],
        }
    except Exception as e:
        print(f"Error: {e}")
        return None


def scraper_wrapper(scrape_object: dict, scraper_function: callable,
                    output_path: str = 'test.jsonl', captcha_path: str = 'captchas.jsonl'):
    """Scrape one page in a fresh Chrome instance and append the outcome to a jsonl file.

    The page at ``scrape_object['link']`` is opened and scrolled to the bottom,
    then ``scraper_function(driver, scrape_object)`` is called.

    Args:
        scrape_object: Description of the page; must contain the key 'link'.
        scraper_function: Callable ``(driver, scrape_object) -> result``.
        output_path: File that receives a truthy result as one JSON line.
        captcha_path: File that receives ``scrape_object`` when the result is
            falsy (scraping failed, likely due to a captcha), for debugging and
            retries.

    The browser is closed even when loading or scraping raises; the exception
    itself is propagated to the caller.
    """
    driver = uc.Chrome(headless=False, use_subprocess=False)
    try:
        driver.get(scrape_object['link'])
        scroll_method(driver)
        result = scraper_function(driver, scrape_object)
    finally:
        driver.quit()

    if result:
        target, record = output_path, result
    else:
        target, record = captcha_path, scrape_object
    with open(target, 'a') as f:
        f.write(json.dumps(record, ensure_ascii=False) + '\n')


# Load the links collected earlier: one JSON object per line, blank lines ignored.
with open('result.jsonl', 'r') as f:
    dicts = [json.loads(line) for line in f if line.strip()]

for scrape_object in dicts:
    # Only plain articles are scraped here; skip everything else without
    # sleeping or reporting it as scraped.
    if scrape_object['type'] != 'article':
        continue

    scraper_wrapper(scrape_object, scrape_article)
    print(f"Scraped {scrape_object['link']}")
    # Random pause between page loads.
    time.sleep(randint(2, 5))

```python
author = driver.find_element(By.XPATH, '//span[@class="byline"]/a').text
```
Note to self: the byline can contain several `<a>` elements when an article has more than one author, so `find_element` only returns the first one.