# In [None]:
import requests
from bs4 import BeautifulSoup
import PyPDF2
from io import BytesIO

BASE_URL = "https://ncn.gov.pl"

def fetch_html(url, timeout=30):
    """Fetch and return the HTML content from the given URL.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without one a
            stalled server would block this call indefinitely.

    Returns:
        The response body as text, decoded by ``requests``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(url, timeout=timeout)
    # Turn HTTP error statuses into exceptions instead of returning an
    # error page as if it were the requested document.
    response.raise_for_status()
    return response.text

def parse_projects(html):
    """Parse HTML content to extract projects from the nz group.

    Every ``<tr>`` whose ``class`` attribute contains ``opus-nz`` is
    inspected. A row yields a project only when it holds a
    ``<td class="polski">`` cell containing an ``<a>`` tag that has an
    ``href`` attribute; rows missing any of these are skipped.

    Args:
        html: HTML markup of the ranking-list page.

    Returns:
        A list of dicts, one per project, with the keys:
          * ``'group'``    - the first row class starting with ``opus-nz``
            (e.g. ``opus-nz1``), or ``None`` if no class starts with it.
          * ``'title'``    - the whitespace-stripped link text.
          * ``'pdf_link'`` - the link target; site-relative links (starting
            with ``/``) are prefixed with ``BASE_URL``.
    """
    soup = BeautifulSoup(html, 'html.parser')
    # Find all <tr> tags that have class including 'opus-nz'
    rows = soup.find_all('tr', class_=lambda x: x and 'opus-nz' in x)
    projects = []

    for row in rows:
        # Identify the project group from the row classes (e.g. opus-nz1, opus-nz2)
        classes = row.get('class')
        group = next((cls for cls in classes if cls.startswith('opus-nz')), None)

        # The project title and PDF link live in the <td class="polski"> cell.
        td_polski = row.find('td', class_='polski')
        if td_polski is None:
            continue
        a_tag = td_polski.find('a')
        if a_tag is None:
            continue

        pdf_link = a_tag.get('href')
        if pdf_link is None:
            # An anchor without an href has nothing to download; skip it
            # rather than crash on the string operations below.
            continue

        # Convert relative URL to absolute URL if needed
        if pdf_link.startswith('/'):
            pdf_link = BASE_URL + pdf_link

        projects.append({
            'group': group,
            'title': a_tag.get_text(strip=True),
            'pdf_link': pdf_link
        })
    return projects

def fetch_pdf_content(pdf_url, timeout=30):
    """Download the PDF file from pdf_url and extract its text content.

    Args:
        pdf_url: Address of the PDF document to download.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout by default, so without one a
            stalled server would block this call indefinitely.

    Returns:
        The text of all pages concatenated in page order, with no separator
        between pages. Pages for which no text can be extracted contribute
        an empty string.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    response = requests.get(pdf_url, timeout=timeout)
    response.raise_for_status()

    # The PDF is parsed entirely in memory; nothing is written to disk.
    with BytesIO(response.content) as f:
        reader = PyPDF2.PdfReader(f)
        # extract_text() may yield nothing for a page, hence the ``or ""``.
        # The join runs inside the ``with`` so the buffer is still open
        # while the pages are being read.
        return "".join(page.extract_text() or "" for page in reader.pages)


# Driver: download the ranking-list page, extract the nz-group projects and
# show a short text preview of each project's PDF.
html_url = "https://ncn.gov.pl/sites/default/files/listy-rankingowe/2024-03-15-oppr4giwi8/opus.html"
html_content = fetch_html(html_url)
projects = parse_projects(html_content)
print("projects", projects)

for project in projects:
    # Project header, one field per line.
    print(
        f"Group: {project['group']}",
        f"Title: {project['title']}",
        f"PDF Link: {project['pdf_link']}",
        sep="\n",
    )

    # Best-effort PDF download: a failure for one project is reported and
    # the loop moves on to the next. Only the first 500 characters of the
    # extracted text are shown; the full text is stored on the project.
    try:
        pdf_text = project['pdf_content'] = fetch_pdf_content(project['pdf_link'])
        print("PDF Content Preview:", pdf_text[:500], sep="\n")
    except Exception as exc:
        print(f"Error fetching PDF: {exc}")
    print("-" * 80)