In [5]:
from selenium import webdriver
from selenium.webdriver.common.by import By
import time
import json
import undetected_chromedriver as uc
import time
from random import randint
import dataclasses as dc
from datetime import datetime, date
from pathlib import Path
import logging


## General functions

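Here we define the shared building blocks used by every site scraper: a `WebPage` record, a generic `Scraper` that scrolls a page to its end and writes the results to JSONL files, and site-specific helpers such as detecting the type of an article from its link.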

In [None]:
@dc.dataclass
class WebPage:
    website: str = None
    url: str = None
    link: str = None
    title: str = None
    media_type: str = None
    date: str =None
    content: str = None
   
class Scraper:
    """Generic page driver: open a page in Chrome, scroll it and persist results.

    The class knows nothing about a particular website. Site scrapers pass in
    a ``scraper_function(driver, scrape_object)`` that extracts data from the
    currently loaded page.
    """

    def click_show_more_button(self, driver) -> bool:
        """Click the "show more" button when it is present and visible.

        Returns:
            True if the button was clicked, False otherwise.
        """
        try:
            button = driver.find_element(By.XPATH, '//button[@class="show-more-button big-margin"]')
            if button.is_displayed():
                button.click()
                time.sleep(1)
                return True
        except Exception:
            # Deliberate best-effort: a missing or unclickable button simply
            # means there is nothing more to load through it.
            return False
        return False

    @staticmethod
    def _drop_already_written(pages: list, seen_links: set) -> list:
        """Return the items of *pages* whose ``link`` was not seen before.

        *seen_links* is updated in place. Items without a link are always kept.
        """
        fresh = []
        for page in pages:
            link = getattr(page, 'link', None)
            if link is not None:
                if link in seen_links:
                    continue
                seen_links.add(link)
            fresh.append(page)
        return fresh

    def scroll_method(self, driver, scraper_function=None, scrape_object=None, filename=None):
        """Scroll to the bottom of the page until it stops growing.

        After every scroll the "show more" button is clicked if available.
        When both *scraper_function* and *scrape_object* are given, the
        function is called after every scroll; it must return a
        ``(result, stop_flag)`` pair. The result is written with
        :meth:`write_to_jsonl`, and a true ``stop_flag`` ends the loop.

        The scraper function sees the whole page on every pass, so list
        results are filtered to items whose link has not been written yet
        during this call; otherwise every scroll would duplicate all earlier
        records in the output file.

        The loop also ends once the page height is unchanged three times in a
        row. Any exception is reported and ends the scrolling without being
        propagated to the caller.
        """
        try:
            last_height = driver.execute_script("return document.body.scrollHeight")
            same_height_count = 0
            seen_links = set()

            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(1)

                self.click_show_more_button(driver)

                if scraper_function and scrape_object:
                    result, stop_flag = scraper_function(driver, scrape_object)
                    if isinstance(result, list):
                        result = self._drop_already_written(result, seen_links)
                    self.write_to_jsonl(result, filename)
                    if stop_flag:
                        logger.info("Criteria met, terminating infinity scroll.")
                        break

                new_height = driver.execute_script("return document.body.scrollHeight")

                if new_height == last_height:
                    same_height_count += 1
                    if same_height_count >= 3:
                        logger.info("Reached the end of the page.")
                        break
                else:
                    same_height_count = 0
                    last_height = new_height

        except Exception as e:
            print("Error:", e)
            logger.exception("Scrolling aborted")

    def write_to_jsonl(self, result: "list[WebPage] | WebPage | Exception | str | None", filename: str):
        """Append the outcome of a scraper function to the matching JSONL file.

        Files live next to the current working directory's parent:
        ``<parent>/output/<filename>.jsonl`` for data,
        ``<parent>/logs/<filename>_error.jsonl`` for errors and
        ``<parent>/logs/<filename>_captcha.jsonl`` for empty results. All
        three files are created if they do not exist yet.

        NOTE(review): other code in this file reads
        'output/israeli_times_links.jsonl' relative to the working directory
        itself, not its parent - confirm which location is intended.

        Routing:
            * list - every element is written as one line of the data file.
            * WebPage - written as a single line of the data file.
            * Exception or str - written as ``{"error": ...}`` to the error
              log (the scraper functions report failures as plain strings).
            * empty list - nothing is written.
            * any other falsy value - one line in the captcha log.
            * anything else - recorded in the error log instead of being
              dropped silently.
        """
        project_root = Path().resolve().parent

        output_dir = project_root / 'output'
        output_dir.mkdir(exist_ok=True)
        logs_dir = project_root / 'logs'
        logs_dir.mkdir(exist_ok=True)

        output_file = output_dir / f'{filename}.jsonl'
        error_file = logs_dir / f'{filename}_error.jsonl'
        captcha_file = logs_dir / f'{filename}_captcha.jsonl'
        for path in (output_file, error_file, captcha_file):
            path.touch(exist_ok=True)

        if not result:
            if not isinstance(result, list):
                # None used to crash on `None + '\n'`; it is now recorded the
                # same way an empty string always was.
                with open(captcha_file, 'a', encoding='utf-8') as f:
                    f.write(('' if result is None else str(result)) + '\n')
            return

        if isinstance(result, (Exception, str)):
            with open(error_file, 'a', encoding='utf-8') as f:
                f.write(json.dumps({'error': str(result)}, ensure_ascii=False) + '\n')
            return

        if isinstance(result, list):
            pages = result
        elif isinstance(result, WebPage):
            pages = [result]
        else:
            with open(error_file, 'a', encoding='utf-8') as f:
                message = f'Unsupported result type: {type(result).__name__}'
                f.write(json.dumps({'error': message}, ensure_ascii=False) + '\n')
            return

        # Serialise everything first so a bad element cannot leave a
        # half-written batch in the data file.
        lines = [json.dumps(dc.asdict(page), ensure_ascii=False) + '\n' for page in pages]
        with open(output_file, 'a', encoding='utf-8') as f:
            f.writelines(lines)

    def run(self, scrape_object: "WebPage", scraper_function, filename: str, incremental: bool = False):
        """Open ``scrape_object.link`` in a fresh Chrome session and scrape it.

        Args:
            scrape_object: Page to open; passed through to *scraper_function*.
            scraper_function: Callable ``(driver, scrape_object)``. In
                incremental mode it must return ``(result, stop_flag)``;
                otherwise it returns the result only.
            filename: Base name (without extension) of the JSONL files.
            incremental: When True the page is scraped and written after every
                scroll step until the stop condition is met. When False
                (default) the page is scrolled to its end first and scraped
                once.

        The browser is always closed, even when scraping raises.
        """
        self.driver = uc.Chrome(headless=False, use_subprocess=False)
        try:
            self.driver.get(scrape_object.link)

            if incremental:
                self.scroll_method(
                    driver=self.driver,
                    scraper_function=scraper_function,
                    scrape_object=scrape_object,
                    filename=filename
                )
            else:
                self.scroll_method(driver=self.driver)
                result = scraper_function(self.driver, scrape_object)
                self.write_to_jsonl(result, filename)
        finally:
            self.driver.quit()

class Database:
    """Look-up helper backed by the JSONL file of already collected links."""

    @staticmethod
    def check_if_exists(link: str) -> bool:
        """Return True if *link* is already recorded in the links file.

        The file 'output/israeli_times_links.jsonl' is read relative to the
        current working directory. A missing file means nothing has been
        collected yet, so the answer is False rather than an error. Blank
        lines and records without a 'link' key are skipped.
        """
        try:
            with open('output/israeli_times_links.jsonl', 'r', encoding='utf-8') as f:
                for line in f:
                    if not line.strip():
                        continue
                    if json.loads(line).get('link') == link:
                        return True
        except FileNotFoundError:
            return False
        return False


class IsraeliTimesScraper():
    """Scraper for timesofisrael.com: collects headline links, then each page."""

    def __init__(self):
        self.scraper = Scraper()

    @staticmethod
    def _format_error(error: Exception, scrape_object: "WebPage") -> str:
        """Build the error text returned by the scrape methods on failure."""
        return f"'Error': {error}, 'title', {scrape_object.title}, 'link', {scrape_object.link} \n"

    def detect_type_article(self, link: str) -> str:
        """Classify a link as 'liveblog', 'blog', 'jewishchronicle' or 'article'.

        The checks are substring matches evaluated in that order; anything
        that matches none of them is an 'article'.
        """
        if 'liveblog' in link:
            return 'liveblog'
        if 'blogs.timesofisrael.com' in link:
            return 'blog'
        # Matched on the host only so that http:// links are recognised too.
        if 'jewishchronicle.timesofisrael.com' in link:
            return 'jewishchronicle'
        return 'article'

    def collect_page_titles(self, driver, _: "WebPage") -> "list[WebPage] | Exception":
        """Collect the headline links of the loaded page that are not stored yet.

        Links already present according to ``Database.check_if_exists`` are
        counted and skipped; repeated links on the page are returned once.

        Returns:
            The list of new pages, or an ``Exception`` instance (returned, not
            raised) when the page holds no new links.
        """
        pages = []
        seen_links = set()
        already_scraped_count = 0
        for anchor in driver.find_elements(By.XPATH, '//div[@class="headline"]/a'):
            href = anchor.get_attribute('href')
            if not href:
                # An anchor without href cannot be visited or classified.
                continue
            if Database.check_if_exists(href):
                already_scraped_count += 1
                continue
            if href in seen_links:
                continue
            seen_links.add(href)

            pages.append(
                WebPage(
                    website='timesofisrael',
                    url=driver.current_url,
                    date=None,
                    title=anchor.text,
                    link=href,
                    media_type=self.detect_type_article(href),
                    content=None
                )
            )
        print("Already scraped:", already_scraped_count)

        if not pages:
            return Exception('Already scraped all articles on this page')

        print('Collecting:', len(pages), 'articles from the page')

        return pages

    def scrape_article(self, driver, scrape_object: "WebPage") -> "WebPage | str":
        """Scrape title, content and date from a regular article page.

        A date text containing "Today" is replaced by today's date in
        ``YYYY-MM-DD`` form. On any failure an error string is returned
        instead of a ``WebPage``.
        """
        try:
            title = driver.find_element(By.XPATH, '//h1[@class="headline"]').text
            content = driver.find_element(By.XPATH, '//div[@class="the-content"]').text
            # Not named `date`, which would shadow the imported datetime.date.
            date_text = driver.find_element(By.XPATH, '//span[@class="date"]').text

            if "Today" in date_text:
                # `datetime` is the class here (from datetime import datetime),
                # so the former `datetime.date.today()` raised AttributeError.
                date_text = datetime.today().strftime("%Y-%m-%d")

            return WebPage(
                website='timesofisrael',
                title=title,
                date=date_text,
                link=scrape_object.link,
                media_type=scrape_object.media_type,
                content=content
            )

        except Exception as e:
            return self._format_error(e, scrape_object)

    def collect_liveblog(self, driver, scrape_object: "WebPage") -> "list[WebPage] | str":
        """Scrape the entries of a liveblog page.

        Headings, paragraphs, heading links and date spans are collected as
        four separate lists and combined position by position; the shortest
        list determines the number of entries.
        NOTE(review): this pairing presumably assumes exactly one paragraph
        per entry - verify against the page markup.

        The 'data-timestamp' attribute is read as epoch seconds and stored as
        a UTC ``YYYY-MM-DD HH:MM:SS`` string. On any failure an error string
        is returned instead of a list.
        """
        from datetime import timezone

        try:
            headings = driver.find_elements(By.XPATH, '//div[@class="liveblog-paragraph"]//h4')
            paragraphs = driver.find_elements(By.XPATH, '//div[@class="liveblog-paragraph"]//p')
            anchors = driver.find_elements(By.XPATH, '//div[@class="liveblog-paragraph"]//h4//a')
            stamps = driver.find_elements(By.XPATH, '//div[@class="liveblog-date"]//a//span')
            print(
                "title:", len(headings),
                "content:", len(paragraphs),
                "link:", len(anchors),
                "date:", len(stamps)
                )

            result = []
            for heading, paragraph, anchor, stamp in zip(headings, paragraphs, anchors, stamps):
                timestamp = int(stamp.get_attribute('data-timestamp'))
                # The former `datetime.datetime.utcfromtimestamp` failed because
                # `datetime` is already the class, not the module.
                posted_at = datetime.fromtimestamp(timestamp, tz=timezone.utc)

                result.append(WebPage(
                    website='timesofisrael',
                    url=driver.current_url,
                    title=heading.text,
                    date=posted_at.strftime('%Y-%m-%d %H:%M:%S'),
                    link=anchor.get_attribute('href'),
                    media_type='liveblog',
                    content=paragraph.text
                ))

            return result
        except Exception as e:
            return self._format_error(e, scrape_object)

    def collect_blogs(self, driver, scrape_object: "WebPage") -> "WebPage | str":
        """Scrape title, content and date from a blog page.

        On any failure an error string is returned instead of a ``WebPage``.
        """
        try:
            title = driver.find_element(By.XPATH, '//h1[@class="headline"]').text
            content = driver.find_element(By.XPATH, '//div[@class="article-content"]').text
            date_text = driver.find_element(By.XPATH, '//aside[@class="block cols1"]//div[@class="date"]').text
            return WebPage(
                website='timesofisrael',
                title=title,
                date=date_text,
                link=scrape_object.link,
                media_type=scrape_object.media_type,
                content=content
            )

        except Exception as e:
            return self._format_error(e, scrape_object)

    def run(self):
        """Collect the homepage links, then scrape every stored link.

        Each record of 'output/israeli_times_links.jsonl' is dispatched on its
        media type; types without a scraper (e.g. 'jewishchronicle') are
        skipped. A random 1-3 second pause follows every record.
        """
        homepage = WebPage(link='https://www.timesofisrael.com/', media_type='homepage')
        # `incremental` is passed explicitly: it was missing before, which
        # raised TypeError when Scraper.run declares it without a default.
        self.scraper.run(homepage, self.collect_page_titles, 'israeli_times_links', incremental=False)

        scrapers_by_type = {
            'article': self.scrape_article,
            'liveblog': self.collect_liveblog,
            'blog': self.collect_blogs,
        }
        with open('output/israeli_times_links.jsonl', 'r', encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                record = json.loads(line)
                page = WebPage(
                    link=record['link'],
                    media_type=record['media_type'],
                    # Carried along so error messages can name the page.
                    title=record.get('title'),
                )
                scrape = scrapers_by_type.get(page.media_type)
                if scrape is not None:
                    self.scraper.run(page, scrape, 'data', incremental=False)

                time.sleep(randint(1, 3))



class AljazeeraScraper():
    def __init__(self):
        self.scraper = Scraper()

    def convert_date(self, date_str: str) -> datetime.date:
        for fmt in ("%d %B %Y", "%d %b %Y"):  # Try full and short month names
            try:
                return datetime.strptime(date_str, fmt).date()
            except ValueError:
                continue
        raise ValueError(f"Date format not recognized: {date_str}")


    def collect_page_titles(self, driver, _: WebPage) -> tuple[list[WebPage], bool]:
        try:
            result = []
            stop_flag = False
            titles = driver.find_elements(By.XPATH, '//h3[@class="gc__title"]')
            hrefs = driver.find_elements(By.XPATH, '//a[@class="u-clickable-card__link"]')
            dates = driver.find_elements(By.XPATH, '//div[@class="date-simple"]//span[@aria-hidden="true"]')
         
            for title, link, article_date in zip(titles, hrefs, dates):
                result.append(
                    WebPage(
                        website='aljazeera',
                        url=driver.current_url,
                        date=article_date.text,
                        title=title.text,
                        link=link.get_attribute('href'),
                    )
                )  
                result_date = self.convert_date(article_date.text)


                cut_off_date = date(2024, 9, 7)
                if result_date < cut_off_date:
                    stop_flag = True
                    return result, stop_flag

            return result, stop_flag
        

        except Exception as e:
            logger.error(f"Error:", {e})
            return [], False
        
                        
    def run(self):
        homepage = WebPage(link='https://www.aljazeera.com/tag/israel-palestine-conflict/', media_type='homepage')
        self.scraper.run(homepage, self.collect_page_titles, 'aljazeera_links', incremental=True)
        time.sleep(randint(1, 3))




# Module-level logger used by the scraper classes above; records go to
# 'scraper.log' in the current working directory.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Loggers are process-wide, so re-running this cell in a notebook would attach
# one more handler each time and duplicate every log line.
if not logger.handlers:
    file_handler = logging.FileHandler('scraper.log')
    file_handler.setLevel(logging.INFO)
    # Create a formatter and set it for the handler
    formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    file_handler.setFormatter(formatter)
    # Add the handler to the logger
    logger.addHandler(file_handler)


# Start scraping only when executed directly (a notebook cell or a script),
# not when this code is imported as a module.
if __name__ == "__main__":
    AljazeeraScraper().run()


Scrolling@
Pressing button
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-15
2025-05-15
Scrolling@
Pressing button
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
2025-05-15
Scrolling@
Pressing button
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-18
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-17
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-16
2025-05-15
2025-05-15
2025-05-15
2025-0

--- Logging error ---
Traceback (most recent call last):
  File "/home/albin/projects/israel/.env/lib/python3.11/site-packages/urllib3/connectionpool.py", line 787, in urlopen
    response = self._make_request(
               ^^^^^^^^^^^^^^^^^^^
  File "/home/albin/projects/israel/.env/lib/python3.11/site-packages/urllib3/connectionpool.py", line 534, in _make_request
    response = conn.getresponse()
               ^^^^^^^^^^^^^^^^^^
  File "/home/albin/projects/israel/.env/lib/python3.11/site-packages/urllib3/connection.py", line 516, in getresponse
    httplib_response = super().getresponse()
                       ^^^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/http/client.py", line 1395, in getresponse
    response.begin()
  File "/usr/lib/python3.11/http/client.py", line 325, in begin
    version, status, reason = self._read_status()
                              ^^^^^^^^^^^^^^^^^^^
  File "/usr/lib/python3.11/http/client.py", line 294, in _read_status
    raise RemoteDisconnec

Error: HTTPConnectionPool(host='localhost', port=52465): Max retries exceeded with url: /session/fce910dfd394df200b27b9a14553853b/execute/sync (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x7a99706becd0>: Failed to establish a new connection: [Errno 111] Connection refused'))
