### __OpenGov Platform: Same-sex marriage__

#### Libraries

In [1]:
import requests
import lxml
import json 
import os
from bs4 import BeautifulSoup 
import pandas as pd
import asyncio
import aiohttp
import logging 
import time 

#### Functions

In [None]:
# load the log file (maps each p value to a comment page number)
def load_log(log_file_path):
    """Read the scrape-progress log into a dict.

    Each valid line has the form ``<p_val>,<page>`` where both fields parse
    as integers. Lines with a different number of fields, or with a field
    that is not an integer (for example a line left truncated by an
    interrupted write), are skipped instead of aborting the whole load.

    Args:
        log_file_path: Path of the log file to read.

    Returns:
        A dict mapping each ``p`` value (int) to its recorded page number
        (int). Empty if the file does not exist or holds no valid lines.
    """
    log = {}
    if not os.path.exists(log_file_path):
        print("incorrect path")
        return log

    with open(log_file_path, "r") as f:
        for line in f:
            parts = line.strip().split(",")
            if len(parts) != 2:
                continue
            try:
                p_val, last_cpage = (int(part) for part in parts)
            except ValueError:
                # Malformed entry: ignore it rather than lose every other entry.
                continue
            log[p_val] = last_cpage
    return log

# update the log file
def update_log(p_val, last_cpage):
    """Store ``last_cpage`` under ``p_val`` and rewrite the whole log file.

    Mutates the module-level ``log`` dict, then overwrites the file at the
    module-level ``log_file_path`` with one ``key,value`` line per entry.
    """
    log[p_val] = last_cpage
    with open(log_file_path, "w") as log_file:
        log_file.writelines(
            f"{p_id},{cpage}\n" for p_id, cpage in log.items()
        )

# page scraping logic
def _parse_comment(comment, page):
    """Build the output dict for a single ``<li class="comment">`` element."""
    author = comment.find("div", class_="author")
    if author:
        # The author block may lack a <strong> tag; fall back instead of
        # raising AttributeError on None.
        name_tag = author.find("strong")
        author_name = name_tag.get_text(strip=True) if name_tag else "Unknown"
        date_published = author.get_text(strip=True).split("|")[0]
    else:
        author_name = "Unknown"
        date_published = "Unknown"

    text = comment.find("p")
    article_text = text.get_text(strip=True) if text else "No text available"

    permalink = comment.find("a", class_="permalink")
    # .get() avoids a KeyError when the anchor has no href attribute.
    comment_url = permalink.get("href", "No URL") if permalink else "No URL"

    return {
        "data_type": "Comment in 'opengov.gr' under the curriculum's content",
        "author_name": author_name,
        "date_published": date_published,
        "article_text": article_text,
        "URL": comment_url,
        "page_found": page,
    }


def scrape_comments(p_val, page, max_retries=3, timeout=30):
    """Fetch one comment page and return its comments.

    Args:
        p_val: Value of the ``p`` query parameter of the page to scrape.
        page: Value of the ``cpage`` query parameter (comment page number).
        max_retries: Maximum number of attempts; both HTTP 429 responses and
            network errors consume an attempt.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            raises (and is retried) instead of blocking forever.

    Returns:
        A list of comment dicts when the page has comments, ``False`` when
        the page has no ``<li class="comment">`` elements, or ``None`` when
        the page could not be fetched (HTTP status >= 400 other than 429, or
        all attempts used up).
    """
    full_url = f"https://www.opengov.gr/ypep/?p={p_val}&cpage={page}#comments"
    headers = {"User-Agent": "Mozilla/5.0"}

    for _ in range(max_retries):
        try:
            response = requests.get(full_url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print(f"[ERROR] Network error on p={p_val}, page={page}: {e}. Retrying...")
            time.sleep(10)
            continue

        if response.status_code == 429:
            print(f"[429] Rate limited on p={p_val}, page={page}. Sleeping for 60s...")
            time.sleep(60)
            continue
        if response.status_code >= 400:
            print(f"[{response.status_code}] Error on p={p_val}, page={page}. Aborting this page.")
            return None  # early failure

        soup = BeautifulSoup(response.text, "html.parser")
        comments = soup.find_all("li", class_="comment")
        if not comments:
            return False  # no comments on this page

        return [_parse_comment(comment, page) for comment in comments]

    print(f"[FAIL] Max retries reached for p={p_val}, page={page}. Skipping.")
    return None

# Build the log path with os.path.join so the separators are correct on every
# OS (hard-coded backslashes only form a valid path on Windows). The base
# directory is two levels above the current working directory, since the
# notebook name is resolved against it by os.path.abspath.
log_file_path = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath("opengov_scrapers.ipynb")))),
    "outputs",
    "logs",
    "scrape_log_opengov_same_sex.txt",
)
log = load_log(log_file_path)

#### Process

In [None]:
# URLs to scrape
# Pages with ?p=847 down to ?p=835, in descending order.
site_list = [
    f"https://www.opengov.gr/ypep/?p={p_id}" for p_id in range(847, 834, -1)
]

#### DEBUG LINES ####
# MAX_PAGES_PER_RUN = 2  # Only scrape 2 comment pages for testing
#### DEBUG LINES ####

In [None]:
# For every URL: resume from the page recorded in the log, scrape comment
# pages until one is empty or fails, then dump the collected comments to JSON.
for url in site_list:
    try:
        p_val = int(url.split("?p=")[-1])
    except ValueError:
        print(f"Invalid URL format: {url}")
        continue

    start_page = log.get(p_val, 1)
    current_page = start_page
    site_comments = []

    print(f"\n🔍 Scraping p={p_val} from comment page {start_page}...")

    pages_scraped = 0

    # update_log() persists progress after every page, while the comments
    # are only held in memory. The finally clause guarantees they are written
    # out even if the loop is interrupted (e.g. KeyboardInterrupt); otherwise
    # the next run would skip pages whose comments were never saved.
    try:
        while True:

            # ### DEBUG LINES
            # if pages_scraped >= MAX_PAGES_PER_RUN:
            #     print(f"Test cutoff: reached MAX_PAGES_PER_RUN for p={p_val}")
            #     break
            # ### DEBUG LINES

            page_comments = scrape_comments(p_val, current_page)

            if page_comments is None:
                print(f"Stopping early for p={p_val} due to repeated failures or rate limits.")
                break

            elif page_comments:
                site_comments.extend(page_comments)
                print(f"Scraped {len(page_comments)} comments from p={p_val}, page {current_page}")
                current_page += 1
                # ### DEBUG LINES
                # pages_scraped += 1
                # ### DEBUG LINES
                # The log stores the NEXT page to scrape for this p value.
                update_log(p_val, current_page)

            else:
                print(f"No more comments at p={p_val}, page {current_page}.")
                break
    finally:
        # save
        if site_comments:
            end_page = current_page - 1
            output_filename = f"opengov_comments_p{p_val}_pages_{start_page}-{end_page}.json"
            with open(output_filename, "w", encoding="utf-8") as f:
                json.dump(site_comments, f, ensure_ascii=False, indent=2)
            print(f"Saved {len(site_comments)} comments to {output_filename}")
        else:
            print(f"No new comments scraped for p={p_val}")
