# PDF Web Scraper Notebook


This notebook provides a script for crawling a website to find and download PDF files. The script utilizes asynchronous HTTP requests for efficient web scraping and file downloading.

The script performs the following tasks:
1. **Crawls a specified website** to find all PDF links.
2. **Downloads the PDF files** and saves them to a local directory (`data/pdf_files/`).
3. **Logs the progress and errors** encountered during the process.


In [None]:
import aiohttp
import asyncio
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import logging

# --- Crawler configuration ---------------------------------------------------

# Page from which the crawl begins
START_URL = 'https://www.telekom.de/hilfe'

# Local folder that receives every downloaded PDF; it is created up front so
# later writes never fail on a missing directory
DOWNLOAD_DIR = 'data/pdf_files'
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Names of the PDF files already written during this run
downloaded_files = set()

# Timestamped, level-tagged log lines; INFO and above are shown
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Function to find PDF links on a given URL
async def find_pdfs(url, session):
    """
    Fetches the content of a page and finds all PDF links.

    A link counts as a PDF when the *path* of its resolved URL ends with
    ``.pdf`` (case-insensitive). Testing the path instead of the raw ``href``
    also catches links that carry a query string or a fragment, such as
    ``manual.pdf?v=2`` or ``manual.pdf#page=3``.

    Args:
        url (str): The URL of the page to fetch.
        session (aiohttp.ClientSession): The aiohttp session used for making HTTP requests.

    Returns:
        list: Absolute, fragment-free URLs pointing to PDF files, in document
        order and without duplicates. Empty when the response is not HTML or
        the request fails (both cases are logged, never raised).
    """
    # Imported locally because the file header only imports urljoin.
    from urllib.parse import urldefrag, urlparse

    pdf_links = []
    try:
        async with session.get(url) as response:
            # Only process if the response is HTML
            content_type = response.headers.get('Content-Type', '').lower()
            if 'text/html' not in content_type:
                logger.error(f'{url} is not an HTML page.')
                return pdf_links

            soup = BeautifulSoup(await response.text(), 'html.parser')
            seen = set()
            for anchor in soup.find_all('a', href=True):
                # Resolve relative links and drop the fragment, which is never
                # sent to the server and would only create duplicates.
                absolute, _fragment = urldefrag(urljoin(url, anchor['href'].strip()))
                if not urlparse(absolute).path.lower().endswith('.pdf'):
                    continue
                if absolute not in seen:
                    seen.add(absolute)
                    pdf_links.append(absolute)
    except Exception as e:
        logger.error(f'Failed to fetch {url}. Error: {str(e)}')

    return pdf_links

# Function to download a PDF file
async def download_pdf(url, session):
    """
    Downloads a PDF file from the given URL and saves it to the local directory.

    The local file name is the last segment of the URL *path*, percent-decoded,
    so a query string or fragment never ends up in the name. A file name that
    was already saved during this run is skipped. All failures are logged and
    swallowed; nothing is raised to the caller.

    Args:
        url (str): The URL of the PDF file to download.
        session (aiohttp.ClientSession): The aiohttp session used for making HTTP requests.
    """
    # Imported locally because the file header only imports urljoin.
    from urllib.parse import unquote, urlparse

    # basename() after decoding also neutralises encoded path separators
    # ("%2F"), so the file always lands directly inside DOWNLOAD_DIR.
    filename = os.path.basename(unquote(urlparse(url).path))
    if not filename:
        logger.error(f'Failed to download {url}. Error: URL has no file name')
        return

    # Skip download if file has already been downloaded
    if filename in downloaded_files:
        logger.info(f'Already downloaded: {filename}')
        return

    try:
        async with session.get(url) as response:
            # Only process if the response status is OK
            if response.status != 200:
                logger.error(f'Failed to download {url}. Status code: {response.status}')
                return
            # Read the whole body before touching the disk so that a failed
            # transfer does not leave an empty or truncated file behind.
            payload = await response.read()

        file_path = os.path.join(DOWNLOAD_DIR, filename)
        # Save the PDF file to disk
        with open(file_path, 'wb') as f:
            f.write(payload)
        downloaded_files.add(filename)
        logger.info(f'Downloaded: {file_path}')
    except Exception as e:
        logger.error(f'Failed to download {url}. Error: {str(e)}')

# Function to crawl the website and find PDF links
async def crawl_site(start_url):
    """
    Crawls the website starting from the given URL, finds PDF links, and downloads them.

    Only root-relative links (``href`` starting with ``/``) that resolve to the
    same host as ``start_url`` are followed. Fragments are stripped before a
    URL is queued, and URLs whose path ends in ``.pdf`` are never queued as
    pages because ``find_pdfs``/``download_pdf`` already handle them.

    Args:
        start_url (str): The starting URL for the crawl.
    """
    # Imported locally because the file header only imports urljoin.
    from urllib.parse import urldefrag, urlparse

    start_host = urlparse(start_url).netloc
    urls_to_visit = {start_url}
    visited_urls = set()

    async with aiohttp.ClientSession() as session:
        while urls_to_visit:
            url = urls_to_visit.pop()
            # Skip URLs that have already been visited
            if url in visited_urls:
                continue
            visited_urls.add(url)
            logger.info(f'Crawling: {url}')

            # Find and download PDF links on the current page
            pdf_links = await find_pdfs(url, session)
            for link in pdf_links:
                await download_pdf(link, session)

            # NOTE: the page is requested a second time here because find_pdfs
            # only returns PDF links, not the parsed document.
            try:
                async with session.get(url) as response:
                    content_type = response.headers.get('Content-Type', '').lower()
                    # Only parse successful HTML responses; decoding a binary
                    # body as text is wasteful and can fail.
                    if response.status == 200 and 'text/html' in content_type:
                        soup = BeautifulSoup(await response.text(), 'html.parser')
                        for anchor in soup.find_all('a', href=True):
                            href = anchor['href']
                            if not href.startswith('/'):
                                continue
                            candidate, _fragment = urldefrag(urljoin(url, href))
                            target = urlparse(candidate)
                            # A protocol-relative href ("//other.host/x") also
                            # starts with "/" but points to a different site.
                            if target.netloc != start_host:
                                continue
                            if target.path.lower().endswith('.pdf'):
                                continue
                            if candidate not in visited_urls:
                                urls_to_visit.add(candidate)
            except Exception as e:
                logger.error(f'Failed to crawl {url}. Error: {str(e)}')

# Entry point for the script
async def main():
    """
    Start the crawl at the configured START_URL and wait until it finishes.
    """
    crawl = crawl_site(START_URL)
    await crawl

# Strong references to tasks scheduled on an already running loop. The event
# loop itself only keeps weak references, so an unreferenced task could be
# garbage-collected before it finishes.
_BACKGROUND_TASKS = set()


# Function to run an asynchronous coroutine
def run_async(coro):
    """
    Runs an asynchronous coroutine, whether or not an event loop is already running.

    Args:
        coro (coroutine): The coroutine to run.

    Returns:
        When no event loop is running in the current thread, the coroutine is
        run to completion with ``asyncio.run`` and its result is returned.
        When a loop is already running (for example inside a notebook), the
        coroutine is scheduled on it and the resulting task is returned
        immediately; await it to obtain the result.
    """
    try:
        # get_running_loop() raises RuntimeError when no loop is running, which
        # avoids the deprecated implicit loop creation of get_event_loop().
        asyncio.get_running_loop()
    except RuntimeError:
        # If the event loop is not running, run the coroutine until complete
        return asyncio.run(coro)

    # If the event loop is already running, schedule the coroutine
    task = asyncio.ensure_future(coro)
    _BACKGROUND_TASKS.add(task)
    task.add_done_callback(_BACKGROUND_TASKS.discard)
    return task

# Run the script
# The crawl is started only when this code runs as the main module, not when
# it is imported from elsewhere.
if __name__ == '__main__':
    run_async(main())


## Clean and filter PDF Files

### First filter

In [None]:
import os
import shutil
from pypdf import PdfReader
from pathlib import Path

# Directories
pdf_dir = "data/pdf_files"  # input: all downloaded PDFs
first_filter_dir = "data/keyword_filter"  # output: PDFs that pass the first filter

# Keywords to filter by. A PDF whose first page contains any of these
# (case-sensitive substring match) is treated as legal/licence/privacy
# boilerplate and excluded. Each entry needs its own trailing comma: adjacent
# string literals are silently concatenated into a single keyword.
filter_keywords = [
    "Datenschutzhinweis",
    "Datenschutzhinweise",
    "Datenschutzrichtlinie",
    "Datenschutz",
    "Data Privacy",
    "Data privacy",
    "Data privacy information",
    "Ergänzende Bedingungen",
    "End-User License",
    "Firmware-Änderungen",
    "Firmwareänderungen",
    "Firmware",
    "Geschäftsbedingungen",
    "Konformitätserklärung",
    "LEGAL NOTICE",
    "LIZENZTEXTE",
    "LICENSES",
    "LIZENZ",
    "LICENCE",
    "privacy",
    "Privacy",
    "RECHTLICHE HINWEISE",
]

# Create the filtered directory if it doesn't exist
Path(first_filter_dir).mkdir(parents=True, exist_ok=True)

# Function to check if the page has text, if it's an image, or other errors
def check_first_page(pdf_path):
    """
    Decide whether a PDF should be kept, based on its first page.

    Args:
        pdf_path (str): Path of the PDF file to inspect.

    Returns:
        bool: True when the first page has extractable text that contains none
        of ``filter_keywords``. False when the file cannot be read or
        decrypted, when the first page has no extractable text (image-only or
        blank), or when a keyword is present. Errors are printed, not raised.
    """
    try:
        reader = PdfReader(pdf_path)

        # Check if the PDF is encrypted
        if reader.is_encrypted:
            try:
                # Try to decrypt with an empty password (sometimes PDFs don't need a password).
                # decrypt() reports a rejected password through a falsy return
                # value instead of raising, so the result has to be checked.
                if not reader.decrypt(""):
                    print(f"Could not decrypt {pdf_path}: empty password rejected")
                    return False
                print(f"Decrypted PDF: {pdf_path}")
            except Exception as e:
                print(f"Could not decrypt {pdf_path}: {e}")
                return False

        first_page = reader.pages[0]

        # Whitespace-only output is treated the same as no text at all.
        first_page_text = (first_page.extract_text() or "").strip()

        if not first_page_text:
            # Images are only inspected when there is no text: the lookup is
            # just needed to pick the diagnostic message.
            if first_page.images:
                print(f"Image-only page detected in {pdf_path}")
            else:
                print(f"Unreadable or image-only first page in {pdf_path}")
            return False

        # Reject the file if any of the keywords appears in the first page text
        return not any(keyword in first_page_text for keyword in filter_keywords)

    except Exception as e:
        # Handle cases where the file cannot be read or is encrypted
        print(f"Error reading {pdf_path}: {e}")
        return False

# Iterate over the files in the pdf directory and copy every PDF that passes
# the first-page check into the filtered directory.
for file_name in os.listdir(pdf_dir):
    # Case-insensitive on purpose: the crawler also accepts links ending in
    # ".PDF", so such files do exist in the download folder.
    if not file_name.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(pdf_dir, file_name)

    # Check if the PDF should be included based on the first page content
    if not check_first_page(file_path):
        continue

    # Destination path
    destination_path = os.path.join(first_filter_dir, file_name)

    # Copy file if it doesn't already exist in the first_filter folder
    if os.path.exists(destination_path):
        print(f"Skipped (already exists): {file_name}")
    else:
        shutil.copy(file_path, destination_path)
        print(f"Copied: {file_name}")

# Count and print the number of files in the filtered directory
filtered_files = os.listdir(first_filter_dir)
print(f"Number of files in the filtered directory: {len(filtered_files)}")

### Reading remaining complex PDFs

#### İlk önce burada doğrudan PDF'lerden metin çıkaran klasik yöntemleri kullanıyorum.
#### Şu kaynağa atıfta bulunuyorum: https://pub.towardsai.net/advanced-rag-02-unveiling-pdf-parsing-b84ae866344e
#### Tezde karşılaştırma yapmak için bu kodu burada tutuyorum.

In [1]:
import os
import pdfplumber
from unstructured.partition.pdf import partition_pdf
from unstructured.documents.elements import Table, Image as UnstructuredImage
import layoutparser as lp
from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import openai
from dotenv import load_dotenv  # Import the load_dotenv function

# Read the .env file into the process environment before the key lookup below
load_dotenv()

# Hand the API key to the openai module (None when the variable is not set)
openai.api_key = os.environ.get("OPENAI_API_KEY")

# Input folder with the filtered PDFs and output folder for the text files
pdf_dir = "data/keyword_filter"
output_dir = "data/text_outputs"
os.makedirs(output_dir, exist_ok=True)

# One OCR engine instance shared by the functions below
ocr = PaddleOCR(lang='en', use_angle_cls=True)

# Function to extract text from simpler PDFs using pdfplumber
def extract_with_pdfplumber(pdf_path):
    """
    Extract the text of every page with pdfplumber.

    Args:
        pdf_path (str): Path of the PDF file.

    Returns:
        list: One ``("text", page_text)`` tuple per page that yielded text,
        in page order; pages without text are left out.
    """
    with pdfplumber.open(pdf_path) as document:
        # Materialise the list while the document is still open.
        page_texts = [page.extract_text() for page in document.pages]
    return [("text", page_text) for page_text in page_texts if page_text]

# Function to extract tables and text using Unstructured
def extract_with_unstructured(pdf_path):
    """
    Partition a PDF with Unstructured and tag each element by kind.

    Args:
        pdf_path (str): Path of the PDF file.

    Returns:
        list: Tuples in element order:
        ``("table", text)`` for tables, ``("image", element)`` for images
        (the element object itself, for later processing) and
        ``("text", str(element))`` for everything else.
    """
    text_content = []

    for element in partition_pdf(pdf_path):
        if isinstance(element, Table):
            # NOTE(review): Table elements do not appear to expose a `rows`
            # attribute in current Unstructured releases - confirm. It is used
            # when present; otherwise fall back to the HTML rendering stored in
            # the metadata, or to the plain text of the element.
            rows = getattr(element, "rows", None)
            if rows is not None:
                table_text = "\n".join(str(row) for row in rows)
            else:
                metadata = getattr(element, "metadata", None)
                table_text = getattr(metadata, "text_as_html", None) or str(element)
            text_content.append(("table", table_text))
        elif isinstance(element, UnstructuredImage):
            text_content.append(("image", element))
        else:
            text_content.append(("text", str(element)))

    return text_content

# Function to extract text using Layout-Parser and PaddleOCR for complex PDFs
def extract_with_layout_and_ocr(pdf_path):
    """
    Render every page of a PDF to an image and run PaddleOCR on it.

    Args:
        pdf_path (str): Path of the PDF file.

    Returns:
        list: ``("ocr", text)`` tuples, one per OCR result group that contains
        recognised text; the recognised strings of a group are joined with
        single spaces.
    """
    text_content = []

    for i, image in enumerate(convert_from_path(pdf_path)):
        # Save page image for OCR
        image_path = f"temp_page_{i}.png"
        try:
            image.save(image_path, "PNG")

            # Perform OCR
            ocr_result = ocr.ocr(image_path, cls=True)
            for line in ocr_result or []:
                # PaddleOCR yields None (or an empty list) for an image on
                # which nothing was detected; iterating it would raise.
                if not line:
                    continue
                text_content.append(("ocr", ' '.join(word_info[1][0] for word_info in line)))
        finally:
            # Clean up temp image even when saving or OCR failed
            if os.path.exists(image_path):
                os.remove(image_path)

    return text_content

# Function to extract information using GPT-4 Vision (advanced)
def extract_with_gpt4_vision(image_path):
    """
    Ask an OpenAI vision-capable chat model to describe the content of an image.

    The previous implementation called the image-generation endpoint with a
    model name that does not exist and read a ``choices[0]['text']`` field;
    image understanding goes through the chat completions endpoint with the
    image embedded as a base64 data URL.

    Args:
        image_path (str): Path of a PNG image.

    Returns:
        str: The text content of the model's first reply.
    """
    # Imported locally because this cell's import list does not include base64.
    import base64

    with open(image_path, "rb") as img_file:
        img_b64_str = base64.b64encode(img_file.read()).decode("utf-8")

    # Uses the module-level client of the openai package (v1 or newer), which
    # picks up the key assigned to openai.api_key.
    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": "Extract and summarize the text, instructions and diagrams shown in this image.",
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{img_b64_str}"},
                    },
                ],
            }
        ],
    )
    return response.choices[0].message.content

# Main extraction loop: for every PDF, try the extractors from cheapest to most
# expensive and write the tagged results to one text file per PDF.
for file_name in os.listdir(pdf_dir):
    # Case-insensitive so that files named "*.PDF" are processed as well.
    if not file_name.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, file_name)
    output_path = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.txt")

    try:
        # Try rule-based extraction first
        text_content = list(extract_with_pdfplumber(pdf_path))
        # If no text is found, try Unstructured
        if not any(content[0] == "text" for content in text_content):
            text_content += extract_with_unstructured(pdf_path)
        # If still no text, use layout parser and OCR
        if not any(content[0] == "text" for content in text_content):
            text_content += extract_with_layout_and_ocr(pdf_path)

        # Check for images and apply GPT-4 Vision if needed
        for i, item in enumerate(text_content):
            if item[0] != "image":
                continue
            image_path = f"temp_image_{i}.png"
            try:
                # NOTE(review): assumes the image element offers to_image();
                # confirm against the installed Unstructured version.
                item[1].to_image().save(image_path)  # Save the image for GPT-4 Vision
                vision_text = extract_with_gpt4_vision(image_path)
            finally:
                # Remove the temp image even when saving or the API call failed
                if os.path.exists(image_path):
                    os.remove(image_path)
            text_content[i] = ("vision", vision_text)  # Replace image info with vision analysis

        # Write the extracted content to the output file
        with open(output_path, 'w', encoding='utf-8') as output_file:
            for content_type, content in text_content:
                output_file.write(f"{content_type.upper()}: {content}\n\n")  # Format for clarity

        print(f"Processed: {file_name}")

    except Exception as e:
        # One failing PDF must not stop the whole batch
        print(f"Error processing {file_name}: {e}")

[2024/09/25 12:39:28] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/taha/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/taha/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_leng

Exception ignored in: <bound method IPythonKernel._clean_thread_parent_frames of <ipykernel.ipkernel.IPythonKernel object at 0x1054a30e0>>
Traceback (most recent call last):
  File "/Users/taha/Desktop/rag/venv/lib/python3.12/site-packages/ipykernel/ipkernel.py", line 775, in _clean_thread_parent_frames
    def _clean_thread_parent_frames(

KeyboardInterrupt: 


Processed: benutzerhandbuch-erweiterte-konfiguration-digitalisierungsbox-premium-2.pdf
Processed: kurzanl_tpx721_tk_11.01.pdf
Processed: routing-modus-dt-version-zyxel-vmg1312-b30a.pdf
Processed: data-sim.pdf
Processed: bedienungsanleitung_alcatel_lucent_8232_06.2012.pdf
Processed: einrichtung-companyflex-pbx-mode-digitalsierungsbox-basic.pdf
Processed: bedienungsanleitung_eumex_402_stand_27022014.pdf


#### Burada tamamen OpenAI GPT Vision kullanarak PDF'lerin özet bilgisini tek tek alıyorum.

In [4]:
import os
from openai import OpenAI
from dotenv import load_dotenv
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
import base64

# Read the .env file into the process environment before the key lookup below
load_dotenv()

# OpenAI client built from the OPENAI_API_KEY environment variable
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# Input folder with the filtered PDFs and output folder for the text files
pdf_dir = "data/keyword_filter"
output_dir = "data/text_outputs"
os.makedirs(output_dir, exist_ok=True)

# One OCR engine instance shared by the functions below
ocr = PaddleOCR(lang='en', use_angle_cls=True)

# Function to convert an image to a base64 string
def image_to_base64(image_path):
    """
    Read a file and return its bytes as base64 text.

    Args:
        image_path (str): Path of the file to encode.

    Returns:
        str: Base64 representation of the file content.
    """
    with open(image_path, "rb") as source:
        raw_bytes = source.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, 'utf-8')

# Function to extract tables using OCR and send to GPT-4 Vision
def extract_and_send_to_gpt4(pdf_path):
    """
    Summarise every page of a PDF that contains OCR-detectable text.

    Each page is rendered to a temporary PNG and run through PaddleOCR. The OCR
    output is only used as a gate: when it contains at least one non-blank
    string, the page image is sent to ``extract_with_gpt4_vision``.

    Args:
        pdf_path (str): Path of the PDF file.

    Returns:
        str: The non-empty model replies in page order, separated by newlines;
        an empty string when no page produced a reply.
    """
    text_content = []

    for i, image in enumerate(convert_from_path(pdf_path)):
        # Save the image for processing
        image_path = f"temp_page_{i}.png"
        try:
            image.save(image_path, "PNG")

            # Use PaddleOCR to extract data from the image
            ocr_result = ocr.ocr(image_path, cls=True)

            # PaddleOCR yields None for an image without detections, so such
            # entries are skipped instead of iterated. Only real recognised
            # text opens the gate; separators added while joining do not count.
            page_has_text = any(
                word_info[1][0].strip()
                for line in (ocr_result or [])
                if line
                for word_info in line
            )

            if page_has_text:
                # Send the page image to GPT-4 Vision
                response = extract_with_gpt4_vision(image_path)
                # The reply content may be None; keep only real text so the
                # final join cannot fail.
                if response:
                    text_content.append(response)
        finally:
            # Clean up the image file even when OCR or the API call failed
            if os.path.exists(image_path):
                os.remove(image_path)

    return "\n".join(text_content)

# Function to extract information using GPT-4 Vision
def extract_with_gpt4_vision(image_path):
    """
    Send one image to the chat completions API and return the model's reply.

    Args:
        image_path (str): Path of the PNG image to analyse.

    Returns:
        The ``content`` of the first choice's message.
    """
    # Encode the image so it can be embedded in the request as a data URL
    encoded_image = image_to_base64(image_path)

    print(f"Calling GPT-4 Vision API with image: {image_path}")  # Progress message

    # Text part of the user message: what the model should do with the image
    instruction_part = {
        "type": "text",
        "text": (
            "Please analyze the following image. "
            "The image contains instructions, diagrams, and other informative content related to telecommunication devices. "
            "Extract and summarize the relevant information, including instructions and any important details that might be useful."
        ),
    }
    # Image part of the user message
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }
    user_message = {"role": "user", "content": [instruction_part, image_part]}

    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
    )

    # The reply text is found on the first choice
    first_choice = completion.choices[0]
    return first_choice.message.content

# Main extraction loop: summarise every PDF that has no output file yet.
# NOTE(review): output_dir is the same folder the classic-extraction cell
# writes to, so a PDF already processed there is skipped here - confirm that
# this is intended.
for file_name in os.listdir(pdf_dir):
    # Case-insensitive so that files named "*.PDF" are processed as well.
    if not file_name.lower().endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, file_name)
    output_path = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.txt")

    # Check if the output file already exists
    if os.path.exists(output_path):
        print(f"Skipping {file_name}, output file already exists: {output_path}")
        continue  # Skip processing if output file exists

    try:
        # Extract information and send to GPT-4 Vision
        text_content = extract_and_send_to_gpt4(pdf_path)

        # An empty file would make the exists-check above skip this PDF on
        # every later run, so nothing is written when nothing was extracted.
        if not text_content.strip():
            print(f"No content extracted from {file_name}, output file not written")
            continue

        # Write the extracted content to the output file
        with open(output_path, 'w', encoding='utf-8') as output_file:
            output_file.write(text_content)

        print(f"Processed: {file_name}")

    except Exception as e:
        # One failing PDF must not stop the whole batch
        print(f"Error processing {file_name}: {e}")

[2024/09/25 12:36:04] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/taha/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/taha/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_leng