In [None]:
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.firefox.options import Options
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions
from selenium.webdriver.common.by import By

from bs4 import BeautifulSoup
from pprint import pprint

import pandas as pd
import logging
import json
import time
import re
import os

def open_firefox(
    driver_path=r'C:\Program Files\geckodriver-v0.35.0-win64\geckodriver.exe',
    headless=True,
):
    """Start a Firefox WebDriver session and return it.

    Args:
        driver_path: Filesystem path of the geckodriver executable. Defaults
            to the Windows install location this script was written against;
            pass another path to run on a different machine.
        headless: When true (the default), Firefox is started with the
            ``--headless`` argument.

    Returns:
        The ``webdriver.Firefox`` instance. The caller owns it and is
        responsible for calling ``quit()`` on it.
    """
    firefox_options = Options()
    if headless:
        firefox_options.add_argument('--headless')
    service = Service(driver_path)
    return webdriver.Firefox(service=service, options=firefox_options)

def search_decisions(driver, year="2024"):
    """Submit the decision search form for ``year`` and return the result HTML.

    Loads the search page in ``driver``, types ``year`` into the ``x_ETOS``
    field, picks fixed options in the two drop-downs, clicks the submit button
    and returns ``driver.page_source`` after a short pause.
    """
    driver.get("https://areiospagos.gr/nomologia/apofaseis.asp")

    # Wait up to 10 seconds for the year field to be present before typing.
    year_field = WebDriverWait(driver, 10).until(
        expected_conditions.presence_of_element_located((By.NAME, 'x_ETOS'))
    )
    year_field.clear()
    year_field.send_keys(year)

    # Per the original author's notes, value '6' of X_TMHMA and value '1' of
    # X_SUB_TMHMA are the "ΟΛΕΣ" (all) entries of the decisions and department
    # drop-downs respectively.
    for dropdown_name, option_value in (('X_TMHMA', '6'), ('X_SUB_TMHMA', '1')):
        dropdown = Select(driver.find_element(By.NAME, dropdown_name))
        dropdown.select_by_value(option_value)

    driver.find_element(By.NAME, 'submit_krit').click()

    # Fixed pause before reading the page; presumably gives the results time
    # to load -- TODO confirm 2 seconds is always enough.
    time.sleep(2)
    return driver.page_source

# Usage: start the browser and fetch the search-results page for 2024.
driver = open_firefox()
html_content = search_decisions(driver, year="2024")

# Root logger configuration: INFO and above, each record prefixed with a
# timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def extract_decision_links(driver, html_content, delay=1):
    """Visit every decision linked from a results page and collect its HTML.

    Args:
        driver: WebDriver used to load each decision page.
        html_content: HTML of the search-results page.
        delay: Seconds to sleep after each page load before reading its
            source. Defaults to 1, the value previously hard-coded.

    Returns:
        A list with one dict per successfully fetched decision, with keys
        ``decision_id`` (stripped link text), ``info`` (the part of the href
        after ``info=``, or ``""`` when absent), ``url`` and ``html_source``.
        Links that raise while being processed are logged and skipped.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    decision_links = soup.select('a.blue10_cursor[href^="apofaseis_DISPLAY.asp"]')
    total_links = len(decision_links)
    logging.info(f"Found {total_links} decision links to process.")

    links = []
    for idx, a_tag in enumerate(decision_links, 1):
        try:
            href = a_tag.get('href')
            decision_id = a_tag.text.strip()
            # partition() yields "" when 'info=' is missing and, unlike the
            # previous split('info')[1], does not leave a leading '=' on the
            # extracted value.
            info = href.partition('info=')[2]
            url = f"https://areiospagos.gr/nomologia/{href}"

            logging.info(f"[{idx}/{total_links}] Fetching decision ID: {decision_id} from URL: {url}")
            driver.get(url)
            time.sleep(delay)
            html_source = driver.page_source

            links.append({
                'decision_id': decision_id,
                'info': info,
                'url': url,
                'html_source': html_source
            })

        except Exception as e:
            # Best effort: one failing page must not abort the whole crawl,
            # but keep the traceback so the failure can be diagnosed.
            logging.exception(f"Error fetching decision at index {idx}: {str(e)}")

    logging.info(f"Finished processing {len(links)} decision links.")
    return links

# Crawl every decision linked from the results page. The browser is shut down
# afterwards whether or not the crawl succeeds, so the headless Firefox
# process is not left running once `driver` is rebound later in the notebook.
try:
    links = extract_decision_links(driver, html_content)
finally:
    driver.quit()
print(links)

# Persist the raw pages as UTF-8 JSON; ensure_ascii=False keeps the Greek text
# readable in the file instead of \u escapes.
output_filename = 'html_decisions.json'
with open(output_filename, 'w', encoding='utf-8') as f:
    json.dump(links, f, ensure_ascii=False, indent=2)

logging.info(f"Saved {len(links)} decisions to {output_filename}")

2025-05-03 23:06:34,537 - INFO - Found 2475 decision links to process.
2025-05-03 23:06:34,541 - INFO - [1/2475] Fetching decision ID: 11/2024 from URL: https://areiospagos.gr/nomologia/apofaseis_DISPLAY.asp?cd=RWKDTA2X1554EH8YXW0M7NFLRRV23P&apof=11_2024&info=ΠΟΛΙΤΙΚΕΣ -  Β1
2025-05-03 23:06:36,415 - INFO - [2/2475] Fetching decision ID: 12/2024 from URL: https://areiospagos.gr/nomologia/apofaseis_DISPLAY.asp?cd=N1RABGMSWJWJ7DHP4GZ8JI6Q4TFTMD&apof=12_2024&info=ΠΟΛΙΤΙΚΕΣ -  Β1
2025-05-03 23:06:37,765 - INFO - [3/2475] Fetching decision ID: 13/2024 from URL: https://areiospagos.gr/nomologia/apofaseis_DISPLAY.asp?cd=RSMY30KIH82MLOOC1VPLFAXXXSO0UO&apof=13_2024&info=ΠΟΛΙΤΙΚΕΣ -  Β1
2025-05-03 23:06:39,230 - INFO - [4/2475] Fetching decision ID: 14/2024 from URL: https://areiospagos.gr/nomologia/apofaseis_DISPLAY.asp?cd=ZU5LE0TC6T0315OGAQ6IMDLESK4ST3&apof=14_2024&info=ΠΟΛΙΤΙΚΕΣ -  Β1
2025-05-03 23:06:40,607 - INFO - [5/2475] Fetching decision ID: 31/2024 from URL: https://areiospagos.gr/nomo

In [None]:
# Patterns and constants used by extract_decision_details. They are compiled
# once at import time instead of on every call.
_DECISION_HEADER_RE = re.compile(r'Απόφαση\s+(\d+)\s*/\s*(\d{4})')
_DEPARTMENT_RE = re.compile(r'\((.*?),\s*(.*?)\)')
_DEPARTMENT_ALT_RE = re.compile(r'(Β\d+\'?\s*[Ππ]ολιτικό\s*Τμήμα)')
_JUDGES_RE = re.compile(
    r'(?:Συγκροτήθηκε|ΣΥΓΚΡΟΤΗΘΗΚΕ) από τους [Δδ]ικαστές[:,]?\s*(.+?)Αρεοπαγίτες\.',
    re.DOTALL,
)
_PARENTHESISED_RE = re.compile(r'\([^)]*\)')
_WHITESPACE_RE = re.compile(r'\s+')

# Role titles stripped from the judges list so only the names remain.
_JUDGE_TITLES = (
    'Αντιπρόεδρο του Αρείου Πάγου,',
    'Προεδρεύουσα Αρεοπαγίτη,',
    'Προεδρεύουσα Αρεοπαγίτη ,',
    'Προεδρεύοντα Αρεοπαγίτη ,',
    'Προεδρεύοντα Αρεοπαγίτη,',
    'Εισηγητή,',
    'Εισηγητή ,',
    'Εισηγητής,',
)

_ARTICLE_PATTERNS = (
    # Single articles with optional qualifiers.
    re.compile(
        r"\b(άρθρ(?:ο|ου|α|ων)\s*\d+(?:[Α-Ωα-ω])?"  # article word, number and optional letter
        r"(?:\s+(?:παρ\.?|αριθ\.?|αρ\.?)\s*\d+)?"
        r"(?:\s*(?:εδ\.?|περ\.?|στοιχ\.?)\s*[α-ωΑ-Ω'\"]+)?"
        r"(?:\s+του\s+(?:Ν\.\s*\d+(?:/\d+)?|ΚΠολΔ|KΠολΔ|ΑΚ))?"  # law number or code
        r")",
        re.IGNORECASE,
    ),
    # Article ranges written with "έως".
    re.compile(
        r"\b(άρθρ(?:ο|ου|α|ων)\s*\d+(?:[Α-Ωα-ω])?\s*έως\s*\d+(?:[Α-Ωα-ω])?(?:\s+του\s+(?:Ν\.\s*\d+(?:/\d+)?|ΚΠολΔ|KΠολΔ|ΑΚ))?)",
        re.IGNORECASE,
    ),
    # Comma-separated lists of articles.
    re.compile(
        r"\b(άρθρ(?:ο|ου|α|ων)\s*\d+(?:[Α-Ωα-ω])?((?:\s*,\s*\d+(?:[Α-Ωα-ω])?)+(?:\s*,\s*\d+(?:[Α-Ωα-ω])?\s*(?:παρ\.?\s*\d+)?(?:\s*(?:εδ\.?|περ\.?|στοιχ\.?)\s*[α-ωΑ-Ω'\"]+)?)*(?:\s+(?:και|και του|του)\s+(?:\d+(?:[Α-Ωα-ω])?))?\s+του\s+(?:Ν\.\s*\d+(?:/\d+)?|ΚΠολΔ|KΠολΔ|ΑΚ))?)",
        re.IGNORECASE,
    ),
)


def _extract_department(decision_text):
    """Return ``(department_code, department_type)`` parsed from the text."""
    match = _DEPARTMENT_RE.search(decision_text)
    if match:
        dept_code, dept_type = match.group(1), match.group(2)
        # Map the plural category labels to the singular department name;
        # any other value is passed through unchanged.
        if "ΠΟΙΝΙΚΕΣ" in dept_type:
            dept_type = "Ποινικό"
        elif "ΠΟΛΙΤΙΚΕΣ" in dept_type:
            dept_type = "Πολιτικό"
        return dept_code, dept_type

    alt_match = _DEPARTMENT_ALT_RE.search(decision_text)
    if alt_match:
        # The fallback pattern only matches civil ("Πολιτικό") departments and
        # always starts with the code, so the first token is the code.
        return alt_match.group(1).split()[0], "Πολιτικό"

    return "Unknown", "Unknown"


def _extract_judges(decision_text):
    """Return the comma-separated judge names, or ``'Unknown'``."""
    match = _JUDGES_RE.search(decision_text)
    if not match:
        return 'Unknown'

    judges_names = _PARENTHESISED_RE.sub('', match.group(1))
    for title in _JUDGE_TITLES:
        judges_names = judges_names.replace(title, '')

    # Turn the final " και" ("and") into a list separator like the others.
    judges_names = judges_names.replace(' και', ', ')
    judges_names = _WHITESPACE_RE.sub(' ', judges_names).strip()
    return judges_names.rstrip(',')


def _extract_section(decision_text, start_phrases, end_phrases):
    """Return the text from a start phrase up to the first end phrase.

    The returned text includes the start phrase, has whitespace collapsed,
    and runs to the end of the document when no end phrase follows. Matching
    is case-insensitive. Returns ``"Unknown"`` when no start phrase is found.
    """
    if isinstance(start_phrases, str):
        start_phrases = [start_phrases]
    if isinstance(end_phrases, str):
        end_phrases = [end_phrases]

    starts = "|".join(re.escape(p) for p in start_phrases)
    ends = "|".join(re.escape(p) for p in end_phrases)
    pattern = "(" + starts + r")\s*(.+?)(?=" + ends + "|$)"
    match = re.search(pattern, decision_text, re.DOTALL | re.IGNORECASE)
    if not match:
        return "Unknown"
    return _WHITESPACE_RE.sub(' ', match.group(0).strip())


def _extract_articles(text):
    """Return the distinct article references found in ``text``.

    References are returned in first-seen order, joined by ``", "``; the
    result is ``"Unknown"`` when none are found.
    """
    matches = [
        _WHITESPACE_RE.sub(' ', match.group(0).strip())
        for pattern in _ARTICLE_PATTERNS
        for match in pattern.finditer(text)
    ]
    # dict.fromkeys de-duplicates while keeping first-seen order.
    unique_matches = list(dict.fromkeys(matches))
    return ", ".join(unique_matches) if unique_matches else "Unknown"


def extract_decision_details(driver, url):
    """Load a decision page and parse its main fields out of the page text.

    Args:
        driver: WebDriver used to load ``url``.
        url: Address of the decision page.

    Returns:
        A dict with keys ``decision_number``, ``decision_year``,
        ``department_code``, ``department_type``, ``judges_names``,
        ``court_section``, ``reasoning_section``, ``decision_section``,
        ``articles_referenced`` and ``url``. Every parsed field falls back to
        the string ``"Unknown"`` when its pattern is not found.
    """
    driver.get(url)
    time.sleep(1)

    soup = BeautifulSoup(driver.page_source, 'html.parser')
    decision_text = soup.get_text()

    header_match = _DECISION_HEADER_RE.search(decision_text)
    if header_match:
        number, year = header_match.group(1), header_match.group(2)
    else:
        number = year = "Unknown"

    dept_code, dept_type = _extract_department(decision_text)
    judges_names = _extract_judges(decision_text)

    # Both spellings of the "ΣΚΕΦΘΗΚΕ ..." heading occur, so the court section
    # must stop at either one; otherwise it would run on into the reasoning.
    reasoning_headings = ["ΣΚΕΦΘΗΚΕ ΣΥΜΦΩΝΑ ΜΕ ΤΟ ΝΟΜΟ", "ΣΚΕΦΘΗΚΕ ΣΥΜΦΩΝΑ ΜΕ ΤΟΝ ΝΟΜΟ"]
    court_section = _extract_section(
        decision_text,
        ["ΤΟ ΔΙΚΑΣΤΗΡΙΟ ΤΟΥ ΑΡΕΙΟΥ ΠΑΓΟΥ", "ΤΟ ΔΙΚΑΣΤΗ ΡΙΟ ΤΟΥ ΑΡΕΙΟΥ ΠΑΓΟΥ"],
        reasoning_headings + ["ΓΙΑ ΤΟΥΣ ΛΟΓΟΥΣ ΑΥΤΟΥΣ"],
    )
    reasoning_section = _extract_section(
        decision_text,
        reasoning_headings,
        ["ΓΙΑ ΤΟΥΣ ΛΟΓΟΥΣ ΑΥΤΟΥΣ"],
    )
    decision_section = _extract_section(
        decision_text,
        "ΓΙΑ ΤΟΥΣ ΛΟΓΟΥΣ ΑΥΤΟΥΣ",
        ["Ο ΑΝΤΙΠΡΟΕΔΡΟΣ", "Η ΑΝΤΙΠΡΟΕΔΡΟΣ", "Η ΓΡΑΜΜΑΤΕΑΣ"],
    )

    articles_referenced = _extract_articles(decision_text)

    return {
        'decision_number': number,
        'decision_year': year,
        'department_code': dept_code,
        'department_type': dept_type,
        'judges_names': judges_names,
        'court_section': court_section,
        'reasoning_section': reasoning_section,
        'decision_section': decision_section,
        'articles_referenced': articles_referenced,
        'url': url
    }
# Try the parser on a single known decision page and pretty-print the result.
driver = open_firefox()
url = 'https://www.areiospagos.gr/nomologia/apofaseis_DISPLAY.asp?cd=ANMYZNF6GXTTGKESQHKM6F5VUG2ELH&apof=1078_2024&info=%D0%CF%C9%CD%C9%CA%C5%D3%20-%20%20%C1%20%D0%EF%E9%ED.%20%C4%E9%E1%EA.'
try:
    details = extract_decision_details(driver, url)
    pprint(details)
    # pprint(details['articles_referenced'])
    time.sleep(2)
finally:
    # Always close the browser, even when loading or parsing the page fails.
    driver.quit()