# Web Crawler for PDF Downloading from Telekom Hilfe

This script is designed to crawl the Telekom Hilfe website, identify all PDF links, and download these files to a specified directory. It uses asynchronous programming with `aiohttp`, together with `BeautifulSoup` for HTML parsing, to process pages efficiently and handle large volumes of data. Logs are generated to track the crawling and downloading processes, providing traceability and error reporting.

In [None]:
"""
This script performs a web crawl starting from a specified URL (Telekom Hilfe website) to find and download PDF files. 
It utilizes asynchronous programming for efficient web scraping and downloading, relying on the aiohttp and BeautifulSoup libraries. 
Downloaded PDFs are saved in a designated directory, with a logging setup to monitor the process and handle errors gracefully.
"""

import aiohttp
import asyncio
from bs4 import BeautifulSoup
import os
from urllib.parse import urljoin
import logging

# Folder that receives the downloaded PDFs; created here if it is missing
DOWNLOAD_DIR = 'data/pdf_files'
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

# Page the crawl starts from
START_URL = 'https://www.telekom.de/hilfe'

# File names saved so far in this run, used to skip duplicate downloads
downloaded_files = set()

# Timestamped INFO-level logging for crawl progress and failures
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

async def find_pdfs(url, session):
    """
    Fetches the HTML content of a page and identifies all links to PDF files.

    A link counts as a PDF when the *path* of the resolved URL ends in ``.pdf``
    (case-insensitive), so links carrying a query string or fragment such as
    ``manual.pdf?v=2`` are recognised as well. Each link is returned once, in
    the order it first appears on the page.

    Args:
        url (str): The URL of the page to scrape.
        session (aiohttp.ClientSession): The aiohttp session used for making HTTP requests.

    Returns:
        list: Absolute URLs of the PDF files linked from the page. Empty when the
        page is not HTML or the request fails (the failure is logged, not raised).
    """
    # Local import: the file only imports urljoin from urllib.parse at the top
    from urllib.parse import urlparse

    pdf_links = []
    try:
        async with session.get(url) as response:
            # Process only if the response content is HTML
            if 'text/html' in response.headers.get('Content-Type', '').lower():
                soup = BeautifulSoup(await response.text(), 'html.parser')
                seen = set()
                for a in soup.find_all('a', href=True):
                    link = urljoin(url, a['href'])
                    # Test the path only, so "?query" and "#fragment" do not hide the extension
                    if not urlparse(link).path.lower().endswith('.pdf'):
                        continue
                    if link not in seen:
                        seen.add(link)
                        pdf_links.append(link)
            else:
                logger.error(f'{url} is not an HTML page.')
    except Exception as e:
        logger.error(f'Failed to fetch {url}. Error: {str(e)}')

    return pdf_links

async def download_pdf(url, session):
    """
    Downloads a PDF file from the specified URL and saves it to the download directory.

    The local file name is the last segment of the URL *path*: query strings and
    fragments are dropped and percent-escapes are decoded, so
    ``.../guide%20v2.pdf?dl=1`` is stored as ``guide v2.pdf``. Files whose name was
    already saved during this run are skipped. Failures are logged, not raised.

    Args:
        url (str): The URL of the PDF file to download.
        session (aiohttp.ClientSession): The aiohttp session used for making HTTP requests.
    """
    # Local import: the file only imports urljoin from urllib.parse at the top
    from urllib.parse import unquote, urlparse

    # Decode first, then take the basename, so an escaped "/" cannot smuggle in a sub-path
    filename = os.path.basename(unquote(urlparse(url).path))
    if not filename:
        logger.error(f'Failed to download {url}. Error: URL has no file name')
        return

    # Skip download if the file has already been processed
    if filename in downloaded_files:
        logger.info(f'Already downloaded: {filename}')
        return

    try:
        async with session.get(url) as response:
            # Proceed if the HTTP response status is 200 (OK)
            if response.status == 200:
                # Read the whole body before opening the file so that a failed
                # transfer does not leave an empty or truncated file on disk
                payload = await response.read()
                file_path = os.path.join(DOWNLOAD_DIR, filename)
                with open(file_path, 'wb') as f:
                    f.write(payload)
                downloaded_files.add(filename)
                logger.info(f'Downloaded: {file_path}')
            else:
                logger.error(f'Failed to download {url}. Status code: {response.status}')
    except Exception as e:
        logger.error(f'Failed to download {url}. Error: {str(e)}')

async def crawl_site(start_url):
    """
    Initiates a web crawl from the starting URL, finds PDF links on each page, and downloads them.

    Only root-relative links (``href`` starting with ``/``) that resolve to the same
    host as ``start_url`` are followed. Fragments are stripped before a URL is queued
    so ``/page`` and ``/page#top`` count as one page, and links whose path ends in
    ``.pdf`` are never queued as pages (they are handled by the download step).

    Args:
        start_url (str): The root URL to begin crawling.
    """
    # Local import: the file only imports urljoin from urllib.parse at the top
    from urllib.parse import urldefrag, urlparse

    start_host = urlparse(start_url).netloc
    urls_to_visit = {start_url}
    visited_urls = set()

    async with aiohttp.ClientSession() as session:
        while urls_to_visit:
            url = urls_to_visit.pop()
            # Skip if URL has already been visited
            if url in visited_urls:
                continue
            visited_urls.add(url)
            logger.info(f'Crawling: {url}')

            # Find and download PDFs from the current page
            pdf_links = await find_pdfs(url, session)
            for link in pdf_links:
                await download_pdf(link, session)

            try:
                async with session.get(url) as response:
                    content_type = response.headers.get('Content-Type', '').lower()
                    # Only successful HTML responses are parsed for further links
                    if response.status != 200 or 'text/html' not in content_type:
                        continue
                    soup = BeautifulSoup(await response.text(), 'html.parser')

                    for a in soup.find_all('a', href=True):
                        href = a['href']
                        if not href.startswith('/'):
                            continue
                        # Drop "#fragment" so the same page is not queued twice
                        candidate = urldefrag(urljoin(url, href)).url
                        parsed = urlparse(candidate)
                        # "//other-host/..." also starts with "/" but leaves the site
                        if parsed.netloc != start_host:
                            continue
                        # PDFs are downloaded, not crawled
                        if parsed.path.lower().endswith('.pdf'):
                            continue
                        if candidate not in visited_urls:
                            urls_to_visit.add(candidate)
            except Exception as e:
                logger.error(f'Failed to crawl {url}. Error: {str(e)}')

async def main():
    """
    Entry-point coroutine: crawls the site beginning at START_URL and downloads the PDFs it finds.
    """
    crawl = crawl_site(START_URL)
    await crawl

def run_async(coro):
    """
    Runs a coroutine to completion, or schedules it when an event loop is already running.

    The running loop is probed with ``asyncio.get_running_loop()`` because
    ``asyncio.get_event_loop()`` is deprecated when no loop is running.

    Args:
        coro (coroutine): The coroutine object to execute.

    Returns:
        The coroutine's result when it was run to completion here, or the scheduled
        task when a loop was already running (for example inside a notebook kernel).
    """
    try:
        asyncio.get_running_loop()
    except RuntimeError:
        # No loop is running in this thread: run the coroutine on a fresh loop
        return asyncio.run(coro)

    # A loop is already active and cannot be blocked on, so schedule the coroutine on it
    return asyncio.ensure_future(coro)

# Launch the crawl only when this code is executed directly
if __name__ == '__main__':
    entry_coro = main()
    run_async(entry_coro)

# PDF Filter

This script filters PDF files by checking the first page for specific keywords related to privacy and legal information. Files that pass the filter (no keyword found) are copied into a separate directory. The script handles encrypted PDFs, image-only pages, and unreadable PDFs, ensuring a reliable dataset for further analysis.

### First filter

In [None]:
"""
This script filters PDF files based on specific keywords found on the first page of each document. 
The primary purpose is to identify and copy PDFs that do not contain any of the specified keywords 
related to privacy and legal information, into a separate directory. 

Directory Structure:
- `data/pdf_files`: The source directory containing the original PDFs to be processed.
- `data/keyword_filter`: The destination directory for PDFs that pass the filtering criteria.

Filtering Criteria:
The script checks each PDF file to see if the first page contains any of the specified keywords 
(e.g., "privacy", "legal notice", etc.). If a keyword is found, the file is excluded. Additionally, 
PDFs that are encrypted, have image-only pages, or cannot be read are also excluded from copying.

Requirements:
- pypdf: For PDF processing and text extraction from pages.
"""

import os
import shutil
from pypdf import PdfReader
from pathlib import Path

# Input folder with the crawled PDFs and output folder for the ones that pass the filter
pdf_dir = "data/pdf_files"
first_filter_dir = "data/keyword_filter"

# A PDF is excluded when any of these strings occurs on its first page
filter_keywords = [
    "Datenschutzhinweis",
    "Datenschutzhinweise",
    "Datenschutzrichtlinie",
    "Datenschutz",
    "Data Privacy",
    "Data privacy",
    "Data privacy information",
    "Ergänzende Bedingungen",
    "End-User License",
    "Firmware-Änderungen",
    "Firmwareänderungen",
    "Firmware",
    "Geschäftsbedingungen",
    "Konformitätserklärung",
    "LEGAL NOTICE",
    "LIZENZTEXTE",
    "LICENSES",
    "LIZENZ",
    "LICENCE",
    "privacy",
    "Privacy",
    "RECHTLICHE HINWEISE",
]

# Create the output folder (and any missing parents) if it is not there yet
Path(first_filter_dir).mkdir(parents=True, exist_ok=True)

def check_first_page(pdf_path):
    """
    Checks the first page of a PDF to determine if it meets the criteria for filtering.
    The function performs the following:
    - Decrypts the PDF with an empty password if it is encrypted.
    - Extracts text from the first page and rejects pages without usable text.
    - Searches for the filter keywords (case-sensitive substring match) in that text.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        bool: True if the PDF passes the filter criteria (readable first page, no
        keyword found), False otherwise. Any error while reading yields False.
    """
    try:
        reader = PdfReader(pdf_path)

        # Attempt to decrypt the PDF if encrypted
        if reader.is_encrypted:
            try:
                # decrypt() signals a rejected password through a falsy return
                # value rather than an exception, so the result must be checked
                decrypted = reader.decrypt("")
            except Exception as e:
                print(f"Could not decrypt {pdf_path}: {e}")
                return False
            if not decrypted:
                print(f"Could not decrypt {pdf_path}: empty password rejected")
                return False
            print(f"Decrypted PDF: {pdf_path}")

        if len(reader.pages) == 0:
            print(f"Unreadable or image-only first page in {pdf_path}")
            return False

        # Extract the text first: it alone decides whether the file can be checked
        first_page = reader.pages[0]
        first_page_text = first_page.extract_text()

        # Whitespace-only output is treated like no text at all
        if not first_page_text or not first_page_text.strip():
            # Images are inspected only to choose the log message. Depending on the
            # pypdf version this may decode the images and fail, so it is guarded
            # and never influences pages that do have text.
            try:
                has_images = bool(first_page.images)
            except Exception:
                has_images = False

            if has_images:
                print(f"Image-only page detected in {pdf_path}")
            else:
                print(f"Unreadable or image-only first page in {pdf_path}")
            return False

        # Keyword found -> exclude the file; otherwise it passes the filter
        return not any(keyword in first_page_text for keyword in filter_keywords)

    except Exception as e:
        print(f"Error reading {pdf_path}: {e}")
        return False  # Return False if any exception occurs (unreadable or inaccessible file)

# Process each PDF in the source directory
for file_name in os.listdir(pdf_dir):
    # Compare the extension case-insensitively: the crawler also accepts links
    # ending in ".PDF", and those files must not be skipped silently here
    if not file_name.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(pdf_dir, file_name)

    # Only files whose first page passes the keyword check are copied
    if not check_first_page(file_path):
        continue

    # Define the destination path in the filtered directory
    destination_path = os.path.join(first_filter_dir, file_name)

    # Copy the file if it does not already exist in the filtered directory
    if not os.path.exists(destination_path):
        shutil.copy(file_path, destination_path)
        print(f"Copied: {file_name}")
    else:
        print(f"Skipped (already exists): {file_name}")

# Output the number of entries in the filtered directory (includes files from earlier runs)
filtered_files = os.listdir(first_filter_dir)
print(f"Number of files in the filtered directory: {len(filtered_files)}")

# PDF Keyword Filter

This script filters PDF files from a specified directory based on the presence of predefined keywords. If a PDF file's name or the text on its first page includes any of the target keywords, the entire file is copied to a target directory.

In [None]:
# Description: This script searches for PDF files in a specified directory that contain specific keywords.
# If a PDF file contains one of the target keywords either in its filename or on the first page,
# the script copies the entire PDF to a target directory for further use. 
# This is especially useful for filtering large PDF datasets by content relevance.

import os
from pypdf import PdfReader, PdfWriter
from pathlib import Path

# Input: PDFs that passed the first filter. Output: PDFs that mention a target keyword.
pdf_dir = "data/keyword_filter"
target_dir = "data/magenta_files"

# Terms looked up in each file name and in the first-page text
keywords = ["magenta"]

# Create the output folder (and any missing parents) if it is not there yet
Path(target_dir).mkdir(parents=True, exist_ok=True)

# Case-insensitive "does any keyword occur in this text" helper
def contains_keyword(text, keywords):
    """
    Reports whether at least one keyword occurs in the text, ignoring case.

    Args:
        text (str): The text to search within.
        keywords (list of str): Keywords to look for as substrings.

    Returns:
        bool: True as soon as one keyword is found, False if none matches.
    """
    haystack = text.lower()
    for candidate in keywords:
        if candidate.lower() in haystack:
            return True
    return False

# Function to save all pages of a PDF to a new PDF file
def save_all_pages(pdf_path, destination_path):
    """
    Copies all pages from the source PDF file to a new PDF file.

    The source is opened with its own reader, so an encrypted source is decrypted
    here with an empty password; a reader decrypted by the caller is not reused.
    The written copy is not encrypted.

    Args:
        pdf_path (str): Path to the source PDF file.
        destination_path (str): Path to save the new PDF file.

    Raises:
        ValueError: If the source is encrypted and the empty password is rejected.
    """
    reader = PdfReader(pdf_path)

    # decrypt() returns a falsy value when the password is rejected
    if reader.is_encrypted and not reader.decrypt(""):
        raise ValueError(f"Cannot decrypt {pdf_path} with an empty password")

    writer = PdfWriter()

    # Add all pages from the source PDF to the new PDF
    for page in reader.pages:
        writer.add_page(page)

    # Write the new PDF to the destination path
    with open(destination_path, "wb") as output_pdf:
        writer.write(output_pdf)

# Iterate through each file in the PDF directory to check for keywords
for file_name in os.listdir(pdf_dir):
    # Compare the extension case-insensitively so "X.PDF" is not skipped silently
    if not file_name.lower().endswith(".pdf"):
        continue

    file_path = os.path.join(pdf_dir, file_name)
    destination_path = os.path.join(target_dir, file_name)

    # Both match paths run inside the try block so that one unreadable PDF is
    # reported and skipped instead of aborting the whole run
    try:
        # A keyword in the file name is sufficient; the content is not inspected
        if contains_keyword(file_name, keywords):
            save_all_pages(file_path, destination_path)
            print(f"Copied (by filename match): {file_name}")
            continue

        reader = PdfReader(file_path)

        # Check if the PDF is encrypted and attempt to decrypt it
        if reader.is_encrypted:
            try:
                decrypted = reader.decrypt("")  # Attempt to decrypt with an empty password
            except Exception as e:
                print(f"Unable to decrypt: {file_path}: {e}")
                continue
            # A rejected password is reported through a falsy return value
            if not decrypted:
                print(f"Unable to decrypt: {file_path}: empty password rejected")
                continue

        # Extract and check text from the first page
        first_page = reader.pages[0]
        first_page_text = first_page.extract_text()

        # Save the entire PDF if the first page contains a keyword
        if first_page_text and contains_keyword(first_page_text, keywords):
            save_all_pages(file_path, destination_path)
            print(f"Copied (by first page content): {file_name}")

    except Exception as e:
        print(f"Error reading PDF: {file_path}: {e}")

# Print the number of files in the target directory
filtered_files = os.listdir(target_dir)
print(f"Number of files in target directory: {len(filtered_files)}")

Kopyalandı (dosya adı ile): magentamobil-speedbox-young.pdf
Kopyalandı (ilk sayfa ile): konfiguration-zyxel-speedlink-5501.pdf
Kopyalandı (ilk sayfa ile): inbetriebnahme-router-mit-reset.pdf
Kopyalandı (dosya adı ile): magentamobil-speedbox-flex-young.pdf
Kopyalandı (dosya adı ile): magentamobil-xl.pdf
Kopyalandı (dosya adı ile): magentamobil-speedbox-flex.pdf
Kopyalandı (dosya adı ile): magentamobil-xl-flex.pdf
Kopyalandı (ilk sayfa ile): inbetriebnahme-frtzbox-mit-reset.pdf
Kopyalandı (dosya adı ile): magentamobil-xl-young.pdf
Kopyalandı (dosya adı ile): magentamobil-special-m-flex.pdf
Kopyalandı (dosya adı ile): magentamobil-basic.pdf
Kopyalandı (dosya adı ile): magentamobil-data-s.pdf
Kopyalandı (dosya adı ile): magentazuhause-xl.pdf
Kopyalandı (dosya adı ile): magentamobil-pluskarte-flex.pdf
Kopyalandı (dosya adı ile): installationsanleitung-mein-magenta-app.pdf
Kopyalandı (dosya adı ile): magentamobil-special-m-eins.pdf
Kopyalandı (dosya adı ile): magentamobil-s-flex-young.pdf
Ko



Kopyalandı (dosya adı ile): magentamobil-prepaid-m.pdf
Kopyalandı (ilk sayfa ile): esim-profil-apple-watch.pdf




Kopyalandı (dosya adı ile): magentamobil-prepaid-l.pdf
Kopyalandı (dosya adı ile): magentamobil-special-m-flex-young.pdf
Kopyalandı (dosya adı ile): bedienungsanleitung-magenta-tv-stick.pdf




Kopyalandı (ilk sayfa ile): inbetriebnahme-router-ohne-reset.pdf
Kopyalandı (dosya adı ile): magentamobil-prepaid-max.pdf
Kopyalandı (dosya adı ile): kurzbedienungsanleitung-magenta-tv-box.pdf
Kopyalandı (dosya adı ile): magentamobil-xl-flex-young.pdf
Kopyalandı (dosya adı ile): magentamobil-m-young.pdf
Kopyalandı (dosya adı ile): konfiguration-magentazuhause-regio-zyxel-speedlink-6501.pdf
Kopyalandı (ilk sayfa ile): checkliste-neuer-hausanschluss.pdf
Kopyalandı (ilk sayfa ile): esim-aktivierung-apple-watch.pdf
Kopyalandı (dosya adı ile): magentamobil-s-young.pdf




Kopyalandı (dosya adı ile): magentamobil-m.pdf
Kopyalandı (dosya adı ile): magentamobil-prepaid-xl.pdf
Kopyalandı (dosya adı ile): magentamobil-special-m-young.pdf




Kopyalandı (dosya adı ile): MagentaTV_2.0.pdf
Kopyalandı (dosya adı ile): magentamobil-prepaid-s.pdf
Kopyalandı (dosya adı ile): magentamobil-l.pdf
Kopyalandı (dosya adı ile): magentamobil-s-flex.pdf
Kopyalandı (ilk sayfa ile): inbetriebnahme-frtzbox-ohne-reset.pdf
Kopyalandı (dosya adı ile): magentamobil-xl-premium.pdf
Kopyalandı (dosya adı ile): magentamobil-basic-flex.pdf
Kopyalandı (ilk sayfa ile): wechselmatrix.pdf
Kopyalandı (ilk sayfa ile): lte-sofort.pdf
Kopyalandı (dosya adı ile): magentamobil-l-young.pdf
Kopyalandı (ilk sayfa ile): kurzbedienungsanleitung-media-reiceiver-401.pdf
Kopyalandı (dosya adı ile): magentamobil-xs.pdf
Kopyalandı (dosya adı ile): magentamobil-m-flex-young.pdf
Kopyalandı (dosya adı ile): magentamobil-l-flex-young.pdf




Kopyalandı (ilk sayfa ile): unterstuetzte-kameras.pdf
Kopyalandı (dosya adı ile): magentamobil-prepaid-jahrestarif.pdf
Kopyalandı (dosya adı ile): magentamobil-data-l.pdf
Kopyalandı (dosya adı ile): magentamobil-data-m.pdf
Kopyalandı (dosya adı ile): magentamobil-m-flex.pdf
Kopyalandı (dosya adı ile): magentamobil-l-flex.pdf
Kopyalandı (dosya adı ile): manuelle-konfiguration-digitalisierungsbox-magentazuhause-regio.pdf
Hedef klasördeki dosya sayısı: 59


# PDF Text Extraction and Summarization

This script extracts and summarizes text from PDF files. It converts PDF pages to images, applies OCR for text extraction, and uses GPT-4 Vision to generate concise summaries. Outputs are saved as text files.

In [None]:
"""
This script automates the process of extracting text data from PDF files, specifically from instructional and informational content related to telecommunication products.
The primary steps include converting PDF pages to images, performing Optical Character Recognition (OCR) to extract text, and then using the extracted data to query 
OpenAI's GPT-4 Vision API for a more refined extraction and summarization of relevant information. 

The script:
1. Converts PDF files to images (one image per page).
2. Applies OCR to extract textual content from each image.
3. Sends the image data to GPT-4 Vision to extract key instructions and details.
4. Saves the extracted and summarized information to text files.

Requirements:
- OpenAI API key in a `.env` file (using dotenv for secure access).
- PaddleOCR library for text extraction from images.
- pdf2image for converting PDF pages to images.
"""

import os
from openai import OpenAI
from dotenv import load_dotenv
from pdf2image import convert_from_path
from paddleocr import PaddleOCR
import base64

# Input folder with the keyword-matched PDFs and output folder for the generated text files
pdf_dir = "data/magenta_files"
output_dir = "data/text_outputs"
os.makedirs(output_dir, exist_ok=True)

# Read settings such as OPENAI_API_KEY from the .env file into the environment
load_dotenv()

# OpenAI client authenticated with the key taken from the environment
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

# OCR engine configured for English with the angle classifier switched on
ocr = PaddleOCR(lang='en', use_angle_cls=True)

# Helper that turns an image file into base64 text
def image_to_base64(image_path):
    """
    Reads a file in binary mode and returns its content as base64 text.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str: The file's bytes, base64-encoded and decoded to a UTF-8 string.
    """
    with open(image_path, "rb") as img_file:
        raw_bytes = img_file.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, 'utf-8')

# Function to extract text from PDF and send to GPT-4 Vision API
def extract_and_send_to_gpt4(pdf_path):
    """
    Renders each page of a PDF to an image, uses OCR to detect whether the page
    contains text, and sends pages with text to the vision model for summarization.

    The OCR output only acts as a gate: the page *image* (not the OCR text) is what
    gets sent to the model. Pages on which OCR finds no words are skipped.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: The model responses for all processed pages, joined by newlines.
    """
    images = convert_from_path(pdf_path)  # Convert PDF pages to images
    text_content = []

    for i, image in enumerate(images):
        # Save each image page temporarily for OCR processing
        image_path = f"temp_page_{i}.png"
        try:
            image.save(image_path, "PNG")

            # Perform OCR on the image to extract textual information
            ocr_result = ocr.ocr(image_path, cls=True)

            # Collect the recognised words. A page without text can be reported
            # as None (or an empty list), which must not be iterated.
            words = []
            for line in ocr_result or []:
                if not line:
                    continue
                for word_info in line:
                    words.append(word_info[1][0])

            if words:
                # Send the page image to the vision model for further processing
                response = extract_with_gpt4_vision(image_path)
                # The API may return no content; skip it so the final join cannot fail
                if response:
                    text_content.append(response)
        finally:
            # Remove the temporary image even when OCR or the API call fails
            if os.path.exists(image_path):
                os.remove(image_path)

    return "\n".join(text_content)

# Sends one page image to the OpenAI chat completions API and returns the reply text
def extract_with_gpt4_vision(image_path):
    """
    Asks the "gpt-4o-mini" model to analyse and summarize a page image.

    The image is embedded in the request as a base64 data URL next to a fixed
    text instruction.

    Args:
        image_path (str): Path to the image file.

    Returns:
        str: The message content of the first choice in the API response.
    """
    encoded_image = image_to_base64(image_path)

    print(f"Calling GPT-4 Vision API with image: {image_path}")  # Progress message

    instruction = (
        "Please analyze the following image. "
        "The image contains instructions, diagrams, and other informative content related to telecommunication products and devices. "
        "Extract and summarize the relevant information, including instructions and any important details that might be useful."
    )
    text_part = {"type": "text", "text": instruction}
    image_part = {
        "type": "image_url",
        "image_url": {"url": f"data:image/png;base64,{encoded_image}"},
    }

    # A single user message carrying the instruction followed by the image
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": [text_part, image_part]}],
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

# Walk through the input folder and create one text file per PDF
for file_name in os.listdir(pdf_dir):
    if not file_name.endswith(".pdf"):
        continue

    pdf_path = os.path.join(pdf_dir, file_name)  # Full path to the PDF file
    output_path = os.path.join(output_dir, f"{os.path.splitext(file_name)[0]}.txt")  # Matching .txt path

    # Results from earlier runs are kept; the PDF is not processed again
    if os.path.exists(output_path):
        print(f"Skipping {file_name}, output file already exists: {output_path}")
        continue

    try:
        # Summarize the PDF page by page
        text_content = extract_and_send_to_gpt4(pdf_path)
    except Exception as e:
        # Report the failure and move on to the next file
        print(f"Error processing {file_name}: {e}")
    else:
        try:
            # Store the combined page summaries
            with open(output_path, 'w', encoding='utf-8') as output_file:
                output_file.write(text_content)

            print(f"Processed: {file_name}")  # Success message
        except Exception as e:
            print(f"Error processing {file_name}: {e}")

[2024/09/26 22:33:55] ppocr DEBUG: Namespace(help='==SUPPRESS==', use_gpu=False, use_xpu=False, use_npu=False, use_mlu=False, ir_optim=True, use_tensorrt=False, min_subgraph_size=15, precision='fp32', gpu_mem=500, gpu_id=0, image_dir=None, page_num=0, det_algorithm='DB', det_model_dir='/Users/taha/.paddleocr/whl/det/en/en_PP-OCRv3_det_infer', det_limit_side_len=960, det_limit_type='max', det_box_type='quad', det_db_thresh=0.3, det_db_box_thresh=0.6, det_db_unclip_ratio=1.5, max_batch_size=10, use_dilation=False, det_db_score_mode='fast', det_east_score_thresh=0.8, det_east_cover_thresh=0.1, det_east_nms_thresh=0.2, det_sast_score_thresh=0.5, det_sast_nms_thresh=0.2, det_pse_thresh=0, det_pse_box_thresh=0.85, det_pse_min_area=16, det_pse_scale=1, scales=[8, 16, 32], alpha=1.0, beta=1.0, fourier_degree=5, rec_algorithm='SVTR_LCNet', rec_model_dir='/Users/taha/.paddleocr/whl/rec/en/en_PP-OCRv4_rec_infer', rec_image_inverse=True, rec_image_shape='3, 48, 320', rec_batch_num=6, max_text_leng

# Text Categorisation
This script classifies and organizes text files into multiple categories based on content relevance.


In [None]:
# It uses GPT-4o-mini to analyze text files in an input directory, identify the three most relevant categories for each file, 
# and then copies each file to the corresponding category folders within an output directory.
# Dependencies: Ensure the environment contains required libraries and .env file with API key.

# Required Libraries
import os
import shutil
from dotenv import load_dotenv
from langchain_openai import ChatOpenAI
from langchain.schema import HumanMessage

# Folder with the generated summaries and root folder for the categorised copies
# NOTE(review): output_directory is an absolute, machine-specific path
input_directory = "data/text_outputs"
output_directory = "/Users/taha/Desktop/rag/data"

# Read settings from the .env file, then pick up the OpenAI key from the environment
load_dotenv()
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

# Chat model used for the classification requests
model = ChatOpenAI(model="gpt-4o-mini", api_key=OPENAI_API_KEY)

# Category name (also used as folder name) -> description shown to the model
category_map = {
    "Vertrag & Rechnung": "vertrag_rechnung_ihre_daten_kundencenter_login-daten_rechnung_lieferstatus",
    "Hilfe bei Störungen": "hilfe_stoerungen_stoerungen_selbst_beheben_melden_status_verfolgen",
    "Mobilfunk": "mobilfunk_tarife_optionen_mobiles-internet_mailbox_esim_sim-karten",
    "Internet & Telefonie": "internet_telefonie:_ausbau,_sicherheit,_einstellungen,_bauherren,_glasfaser_und_wlan",
    "TV": "tv_magentatv_streaming-dienste_magentatv_jugendschutz_pins",
    "MagentaEINS": "magentains_kombi-pakete_mit_magentains_vorteil_und_treuebonus",
    "Apps & Dienste": "apps_dienste_e-mail_magenta_apps_voicemail_app_mobilityconnect",
    "Geräte & Zubehör": "geraete_zubehoer_anleitungen_fuer_smartphones_tablets_telefone_router_receiver",
}

# Category names in definition order; these are the only accepted answers
folder_names = [*category_map]

def classify_with_gpt(content, model, category_map):
    """
    Classifies the content into the three most relevant categories using the chat model.

    The model is called through ``invoke`` (calling the model object directly is
    deprecated in LangChain) with the prompt as a plain string, which the chat model
    treats as a single human message. Answers are validated against the keys of
    ``category_map``; surrounding quotes, spaces and periods are tolerated.

    Parameters:
    - content (str): The text content of the file to be classified.
    - model (ChatOpenAI): The GPT-4o-mini model instance.
    - category_map (dict): Category name -> description. Should contain at least
      three entries, since the fallback is its first three names.

    Returns:
    - tuple of str: The three category names chosen by the model, or the first
      three names of ``category_map`` when the answer is unusable or the call fails.
    """
    # Fallback used whenever no valid answer is available
    fallback = tuple(list(category_map)[:3])

    categories_text = "\n".join(f"{name}: {desc}" for name, desc in category_map.items())

    prompt = (
        f"Classify the following text into the three most relevant categories based on the descriptions:\n\n"
        f"{categories_text}\n\n"
        f"Text:\n\"{content}\"\n\n"
        f"Which three categories does this text fit into? Reply with only the three category names, separated by a comma."
    )

    try:
        response = model.invoke(prompt)
        category_response = response.content.strip()

        # Parse the comma-separated answer
        categories = []
        for raw_name in category_response.split(","):
            name = raw_name.strip()
            # Only clean up names that do not match as-is, so a category whose
            # real name ends in punctuation is still accepted
            if name not in category_map:
                name = name.strip("\"'. ")
            categories.append(name)

        if len(categories) == 3 and all(cat in category_map for cat in categories):
            return categories[0], categories[1], categories[2]

        print(f"Unrecognized categories returned: {category_response}")
        return fallback  # Default fallback
    except Exception as e:
        print(f"Error during classification: {e}")
        return fallback  # Default fallback

def read_in_chunks(file_path, chunk_size=8191):
    """
    Lazily yields the content of a UTF-8 text file in consecutive pieces.

    Parameters:
    - file_path (str): The path to the file to be read.
    - chunk_size (int): Maximum number of characters per piece (default 8191).

    Yields:
    - str: The next piece of the file; iteration ends at end of file.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        # iter() with a sentinel keeps calling read() until it returns ""
        for piece in iter(lambda: file.read(chunk_size), ""):
            yield piece

def classify_file(file_path, model, category_map):
    """
    Classifies a file's content by reading in chunks and finding the three most relevant categories.

    Every chunk casts one vote for each of the three categories returned for it. The
    vote table is built from the keys of ``category_map`` (the parameter, not a module
    global), and ties are broken by the order of those keys.

    Parameters:
    - file_path (str): The path to the file to be classified.
    - model (ChatOpenAI): The GPT-4o-mini model instance.
    - category_map (dict): Category name -> description; needs at least three entries.

    Returns:
    - tuple of str: The three categories with the most votes for the file.
    """
    categories_count = {cat: 0 for cat in category_map}

    for chunk in read_in_chunks(file_path):
        try:
            first_category, second_category, third_category = classify_with_gpt(chunk, model, category_map)
            chunk_votes = (first_category, second_category, third_category)

            # Validate the whole answer before counting so that an unknown name
            # cannot leave a partially counted vote behind
            unknown = [cat for cat in chunk_votes if cat not in categories_count]
            if unknown:
                print(f"Error processing chunk: unknown categories {unknown}")
                continue

            for cat in chunk_votes:
                categories_count[cat] += 1
        except Exception as e:
            print(f"Error processing chunk: {e}")

    # Sort categories by count and select the top three (stable sort keeps key order on ties)
    sorted_categories = sorted(categories_count, key=categories_count.get, reverse=True)
    return sorted_categories[0], sorted_categories[1], sorted_categories[2]

def copy_to_three_categories(file_path, output_directory, model, category_map):
    """
    Determines the three best-matching categories for a file and places a copy,
    renamed with a "pdf_" prefix, in each category's folder.

    Any error is reported with a printed message and not raised.

    Parameters:
    - file_path (str): Path to the file.
    - output_directory (str): Root directory that holds one sub-folder per category.
    - model (ChatOpenAI): The GPT-4o-mini model instance.
    - category_map (dict): A dictionary with category descriptions.
    """
    try:
        first_category, second_category, third_category = classify_file(file_path, model, category_map)

        source_name = os.path.basename(file_path)
        prefixed_name = "pdf_" + source_name

        # Same steps for every category: make sure the folder exists, copy, report
        for category in (first_category, second_category, third_category):
            category_dir = os.path.join(output_directory, category)
            os.makedirs(category_dir, exist_ok=True)
            shutil.copy(file_path, os.path.join(category_dir, prefixed_name))
            print(f"Copied {source_name} to {category_dir}")

    except Exception as e:
        print(f"Error processing file {file_path}: {e}")

def process_files_in_directory(input_directory, output_directory, category_map, model):
    """
    Runs the classify-and-copy step for every .txt file directly inside the input directory.

    Parameters:
    - input_directory (str): Path to the directory containing .txt files.
    - output_directory (str): Root directory that holds one sub-folder per category.
    - category_map (dict): A dictionary with category descriptions.
    - model (ChatOpenAI): The GPT-4o-mini model instance.
    """
    txt_names = (name for name in os.listdir(input_directory) if name.endswith(".txt"))
    for name in txt_names:
        copy_to_three_categories(
            os.path.join(input_directory, name),
            output_directory,
            model,
            category_map,
        )

# Classify every summary and distribute the copies into the category folders
process_files_in_directory(
    input_directory=input_directory,
    output_directory=output_directory,
    category_map=category_map,
    model=model,
)

  response = model([HumanMessage(content=prompt)])


Copied magentamobil-s-flex-young.txt to /Users/taha/Desktop/rag/data/Vertrag & Rechnung
Copied magentamobil-s-flex-young.txt to /Users/taha/Desktop/rag/data/Mobilfunk
Copied manuelle-konfiguration-magentazuhause-regio-zyxel-speedlink-5501.txt to /Users/taha/Desktop/rag/data/Internet & Telefonie
Copied manuelle-konfiguration-magentazuhause-regio-zyxel-speedlink-5501.txt to /Users/taha/Desktop/rag/data/Hilfe bei Störungen
Copied magentazuhause-xl.txt to /Users/taha/Desktop/rag/data/Vertrag & Rechnung
Copied magentazuhause-xl.txt to /Users/taha/Desktop/rag/data/Internet & Telefonie
Copied magentamobil-pluskarte-flex.txt to /Users/taha/Desktop/rag/data/Vertrag & Rechnung
Copied magentamobil-pluskarte-flex.txt to /Users/taha/Desktop/rag/data/Mobilfunk
Copied installationsanleitung-mein-magenta-app.txt to /Users/taha/Desktop/rag/data/Vertrag & Rechnung
Copied installationsanleitung-mein-magenta-app.txt to /Users/taha/Desktop/rag/data/Apps & Dienste
Copied magentamobil-special-m-eins.txt to /