# Areios Pagos Crawling

In [46]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
from pprint import pprint

import pandas as pd
import time
import re

In [47]:
def open_firefox(driver_path=r'C:\Program Files\geckodriver-v0.35.0-win64\geckodriver.exe'):
        """Start a Firefox WebDriver session and return it.

        Args:
                driver_path: Filesystem path of the geckodriver executable. The
                        default is the Windows install location this notebook was
                        originally written against; pass another path on other
                        machines. When falsy (``None`` or ``""``) the ``Service`` is
                        created without an explicit path, which leaves locating the
                        driver to Selenium itself (NOTE(review): relies on the
                        installed Selenium version being able to find geckodriver
                        on its own - confirm).

        Returns:
                The newly created ``webdriver.Firefox`` instance. The caller is
                responsible for calling ``quit()`` on it.
        """
        firefox_options = Options()
        service = Service(driver_path) if driver_path else Service()
        return webdriver.Firefox(service=service, options=firefox_options)

In [39]:
def search_decisions(driver, year="2024"):
        """Submit the Areios Pagos decision search form and return the result HTML.

        Args:
                driver: An open Selenium WebDriver used to load and drive the page.
                year: Text typed into the year field of the search form.

        Returns:
                ``driver.page_source`` as read two seconds after the search button
                has been clicked.

        Raises:
                selenium.common.exceptions.TimeoutException: If the year field is
                        not present within 10 seconds of loading the page.
        """
        driver.get("https://areiospagos.gr/nomologia/apofaseis.asp")

        # The year input is the only element explicitly waited for; the other
        # form controls are looked up directly once it is present.
        year_locator = (By.NAME, 'x_ETOS')
        year_field = WebDriverWait(driver, 10).until(
                expected_conditions.presence_of_element_located(year_locator)
        )
        year_field.clear()
        year_field.send_keys(year)

        # (dropdown name, option value) pairs, applied in form order. According
        # to the original author's notes both values stand for the 'ΟΛΕΣ' (all)
        # entry of their dropdown - not verifiable from this file alone.
        dropdown_choices = (
                ('X_TMHMA', '6'),
                ('X_SUB_TMHMA', '1'),
        )
        for dropdown_name, option_value in dropdown_choices:
                dropdown = Select(driver.find_element(By.NAME, dropdown_name))
                dropdown.select_by_value(option_value)

        driver.find_element(By.NAME, 'submit_krit').click()

        # Fixed pause to give the results page time to load before reading it.
        time.sleep(2)
        return driver.page_source

# Usage: run one search for 2024 and keep only the resulting HTML string.
driver = open_firefox()
try:
        html_content = search_decisions(driver, "2024")
finally:
        # Only the HTML is needed from here on, so close the browser right away
        # instead of leaving a Firefox process running in the background.
        driver.quit()

In [None]:
def extract_decision_links(html_content):
        """Collect the decision links found in a search-results page.

        Every ``<a class="blue10_cursor">`` whose ``href`` starts with
        ``apofaseis_DISPLAY.asp`` is turned into one entry.

        Args:
                html_content: HTML source of the search-results page.

        Returns:
                A list of dicts, one per matching anchor, with the keys:
                ``decision_id`` (the stripped anchor text), ``info`` (the decoded
                value of the ``info`` query parameter of the href, or ``""`` when
                the parameter is absent or empty) and ``url`` (the href appended to
                ``https://areiospagos.gr/nomologia/``).
        """
        from urllib.parse import parse_qs, urlsplit

        soup = BeautifulSoup(html_content, 'html.parser')
        links = []

        for a_tag in soup.select('a.blue10_cursor[href^="apofaseis_DISPLAY.asp"]'):
                href = a_tag.get('href')

                # Parse the query string properly: splitting the href on the bare
                # text 'info' left a leading '=' in the value, kept any parameters
                # that followed it and returned the value still percent-encoded.
                query_params = parse_qs(urlsplit(href).query)
                info = query_params.get('info', [''])[0]

                links.append({
                        'decision_id': a_tag.text.strip(),
                        'info': info,
                        'url': f"https://areiospagos.gr/nomologia/{href}",
                })

        return links

# Extract the decision links from the search-results HTML held in
# `html_content`, then print how many were found followed by the full list.
links = extract_decision_links(html_content)
print(len(links))
pprint(links)

In [None]:
# Honorifics/role labels stripped from the judges list, tried in this order.
_JUDGE_TITLES_TO_REMOVE = (
        'Αντιπρόεδρο του Αρείου Πάγου,',
        'Προεδρεύουσα Αρεοπαγίτη,',
        'Προεδρεύουσα Αρεοπαγίτη ,',
        'Προεδρεύοντα Αρεοπαγίτη ,',
        'Προεδρεύοντα Αρεοπαγίτη,',
        'Εισηγητή,',
        'Εισηγητή ,',
        'Εισηγητής,',
)


def _parse_decision_number(decision_text):
        """Return ``(number, year)`` from the first 'Απόφαση N / YYYY' match, else Unknowns."""
        match = re.search(r'Απόφαση\s+(\d+)\s*/\s*(\d{4})', decision_text)
        if match:
                return match.group(1), match.group(2)
        return "Unknown", "Unknown"


def _parse_department(decision_text):
        """Return ``(department_code, department_type)``, else Unknowns."""
        # Primary form: the first parenthesised group containing a comma,
        # split into the part before and the part after that comma.
        match = re.search(r'\((.*?),\s*(.*?)\)', decision_text)
        if match:
                return match.group(1), match.group(2)

        # Fallback form: a "Β<digits>['] Πολιτικό Τμήμα" phrase in the text. The
        # character class is [Ππ]; the previous [Π|π] also accepted a literal '|'.
        alt_match = re.search(r'(Β\d+\'?\s*[Ππ]ολιτικό\s*Τμήμα)', decision_text)
        if alt_match:
                # The pattern itself guarantees a non-empty match that contains
                # "Πολιτικό"/"πολιτικό", so the type is always ΠΟΛΙΤΙΚΕΣ here.
                return alt_match.group(1).split()[0], "ΠΟΛΙΤΙΚΕΣ"

        return "Unknown", "Unknown"


def _parse_judges(decision_text):
        """Return the comma-separated judges list from the composition sentence, else 'Unknown'."""
        match = re.search(
                r'(?:Συγκροτήθηκε|ΣΥΓΚΡΟΤΗΘΗΚΕ) από τους [Δδ]ικαστές[:,]?\s*(.+?)Αρεοπαγίτες\.',
                decision_text,
                re.DOTALL,
        )
        if not match:
                return 'Unknown'

        # Drop parenthesised remarks, then the known title phrases.
        names = re.sub(r'\([^)]*\)', '', match.group(1))
        for title in _JUDGE_TITLES_TO_REMOVE:
                names = names.replace(title, '')

        # Turn the conjunction ' και' into a list separator, then normalise
        # whitespace and trim trailing commas.
        names = names.replace(' και', ', ')
        names = re.sub(r'\s+', ' ', names).strip()
        return names.rstrip(',')


def _extract_section(decision_text, start_phrase, end_phrases):
        """Return the text from ``start_phrase`` up to the first end phrase (or end of text).

        Matching is case-insensitive; whitespace runs in the result are collapsed
        to single spaces. Returns ``"Unknown"`` when ``start_phrase`` is not found.
        """
        end_alternatives = "|".join(re.escape(phrase) for phrase in end_phrases)
        pattern = rf"({re.escape(start_phrase)}\s*(?:.+?))(?={end_alternatives}|$)"
        match = re.search(pattern, decision_text, re.DOTALL | re.IGNORECASE)
        if not match:
                return "Unknown"
        return re.sub(r'\s+', ' ', match.group(1).strip())


def extract_decision_details(driver, url):
        """Load one decision page and parse its main fields out of the page text.

        Args:
                driver: An open Selenium WebDriver used to load ``url``.
                url: Address of the decision page.

        Returns:
                A dict with the keys ``decision_number``, ``decision_year``,
                ``department_code``, ``department_type``, ``judges_names``,
                ``court_section``, ``reasoning_section``, ``decision_section`` and
                ``url``. Any field that cannot be located in the page text is set
                to the string ``"Unknown"``.
        """
        driver.get(url)
        # Fixed pause to give the page time to load before reading its source.
        time.sleep(1)

        soup = BeautifulSoup(driver.page_source, 'html.parser')
        decision_text = soup.get_text()

        number, year = _parse_decision_number(decision_text)
        dept_code, dept_type = _parse_department(decision_text)
        judges_names = _parse_judges(decision_text)

        # Each section runs from its heading up to the heading of whichever
        # listed section comes next in the text.
        court_section = _extract_section(
                decision_text,
                "ΤΟ ΔΙΚΑΣΤΗΡΙΟ ΤΟΥ ΑΡΕΙΟΥ ΠΑΓΟΥ",
                ["ΣΚΕΦΘΗΚΕ ΣΥΜΦΩΝΑ ΜΕ ΤΟ ΝΟΜΟ", "ΓΙΑ ΤΟΥΣ ΛΟΓΟΥΣ ΑΥΤΟΥΣ"],
        )
        reasoning_section = _extract_section(
                decision_text,
                "ΣΚΕΦΘΗΚΕ ΣΥΜΦΩΝΑ ΜΕ ΤΟ ΝΟΜΟ",
                ["ΓΙΑ ΤΟΥΣ ΛΟΓΟΥΣ ΑΥΤΟΥΣ"],
        )
        decision_section = _extract_section(
                decision_text,
                "ΓΙΑ ΤΟΥΣ ΛΟΓΟΥΣ ΑΥΤΟΥΣ",
                ["Ο ΑΝΤΙΠΡΟΕΔΡΟΣ", "Η ΑΝΤΙΠΡΟΕΔΡΟΣ", "Η ΓΡΑΜΜΑΤΕΑΣ"],
        )

        return {
                'decision_number': number,
                'decision_year': year,
                'department_code': dept_code,
                'department_type': dept_type,
                'judges_names': judges_names,
                'court_section': court_section,
                'reasoning_section': reasoning_section,
                'decision_section': decision_section,
                'url': url
        }
# Try the parser on one known decision page and print the extracted fields.
url='https://areiospagos.gr/nomologia/apofaseis_DISPLAY.asp?cd=RWKDTA2X1554EH8YXW0M7NFLRRV23P&apof=11_2024&info=%CE%A0%CE%9F%CE%9B%CE%99%CE%A4%CE%99%CE%9A%CE%95%CE%A3'
driver = open_firefox()
try:
        pprint(extract_decision_details(driver, url))
finally:
        # Close this browser once the page has been parsed; main() opens its own.
        driver.quit()

In [None]:
def main(year="2024", limit=1, output_csv=None):
        """Search for decisions of one year, parse some of them and print the results.

        Args:
                year: Year passed to ``search_decisions``.
                limit: Maximum number of decisions to process, taken from the start
                        of the result list. ``None`` processes every decision found.
                        Defaults to 1, the value previously hard-coded here.
                output_csv: Optional path of a CSV file. When given, the parsed
                        details are written there (one row per decision, no index
                        column) after the browser has been closed. When ``None``
                        nothing is written, as before.
        """
        driver = open_firefox()
        try:
                html_content = search_decisions(driver, year=year)
                decision_links = extract_decision_links(html_content)
                print(f"Found {len(decision_links)} decisions.")

                decisions_to_process = decision_links[:limit]
                all_details = []

                for i, link in enumerate(decisions_to_process, 1):
                        print(f"Processing decision {i} / {len(decisions_to_process)}: {link['decision_id']}")
                        details = extract_decision_details(driver, link['url'])
                        pprint(details)
                        all_details.append(details)
                        # Pause between page loads so requests are not sent back to back.
                        time.sleep(1)
        finally:
                # Always close the browser, even if a page failed to load or parse.
                driver.quit()

        if output_csv is not None:
                pd.DataFrame(all_details).to_csv(output_csv, index=False)
                print(f"Data saved to {output_csv}")

# Run the crawl only when this file is executed directly, not when imported.
if __name__ == "__main__":
        main()