In [7]:
import os
import pytesseract
from pdf2image import convert_from_path
from PyPDF2 import PdfReader
from PIL import Image
import cv2
import numpy as np
import fitz  # PyMuPDF
from pdfminer.high_level import extract_text as pdfminer_extract_text
import unicodedata
import pandas as pd
import openai

# Set up OpenAI API key
openai.api_key = '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'

# Set the Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

# Function to preprocess images for better OCR accuracy
def preprocess_image(image):
    """
    Preprocesses an image to improve OCR accuracy.
    """
    # Convert to grayscale
    gray = cv2.cvtColor(np.array(image), cv2.COLOR_BGR2GRAY)
    
    # Apply Gaussian blur to reduce noise
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    
    # Apply adaptive thresholding to binarize the image
    binary = cv2.adaptiveThreshold(blurred, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)
    
    # Apply dilation to make text more prominent
    kernel = np.ones((2, 2), np.uint8)
    processed_image = cv2.dilate(binary, kernel, iterations=1)
    
    return Image.fromarray(processed_image)

# Function to normalize special characters
def normalize_text(text):
    """
    Normalizes special characters in the text.
    """
    return unicodedata.normalize('NFKC', text)

# Function to extract text using PyMuPDF
def extract_text_with_pymupdf(pdf_path):
    """
    Extracts text from a PDF using PyMuPDF.
    """
    text = ""
    try:
        doc = fitz.open(pdf_path)
        for page in doc:
            text += page.get_text()
    except Exception as e:
        print(f"Error extracting text with PyMuPDF from {pdf_path}: {e}")
    return text

# Function to extract text using pdfminer
def extract_text_with_pdfminer(pdf_path):
    """
    Extracts text from a PDF using pdfminer.
    """
    text = ""
    try:
        text = pdfminer_extract_text(pdf_path)
    except Exception as e:
        print(f"Error extracting text with pdfminer from {pdf_path}: {e}")
    return text

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path, languages='eng'):
    """
    Extracts text from a PDF file. Supports multiple languages for OCR.
    """
    text = ""
    try:
        # Try to extract text directly from the PDF using PyPDF2
        reader = PdfReader(pdf_path)
        for page in reader.pages:
            text += page.extract_text()
        
        # If no text is found, use OCR
        if not text.strip():
            try:
                # Specify the Poppler path
                poppler_path = r"C:\Program Files\poppler\poppler-0.89.0\bin"  # Update this path if necessary
                images = convert_from_path(pdf_path, poppler_path=poppler_path)
                print(f"Number of pages converted to images: {len(images)}")  # Debug statement
                
                for i, image in enumerate(images):
                    # Preprocess the image
                    processed_image = preprocess_image(image)
                    # Save the processed image for debugging
                    processed_image.save(f"debug_page_{i + 1}.png")
                    # Extract text using OCR with specified languages
                    page_text = pytesseract.image_to_string(processed_image, lang=languages)
                    text += page_text
                    print(f"Extracted text from page {i + 1}: {len(page_text)} characters")  # Debug statement
            except Exception as ocr_error:
                print(f"OCR failed for {pdf_path}: {ocr_error}")
                # Fallback to PyMuPDF
                text = extract_text_with_pymupdf(pdf_path)
                if not text.strip():
                    # Fallback to pdfminer
                    text = extract_text_with_pdfminer(pdf_path)
    except Exception as e:
        print(f"Error processing {pdf_path}: {e}")
    return text

# Function to process all PDFs in the input folder
def process_pdfs(input_folder, output_folder, languages='eng'):
    """
    Processes all PDFs in the input folder, extracts text, and saves the processed data.
    """
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    failed_files = []
    for filename in os.listdir(input_folder):
        if filename.endswith(".pdf"):
            pdf_path = os.path.join(input_folder, filename)
            print(f"Processing {filename}...")
            text = extract_text_from_pdf(pdf_path, languages=languages)
            
            if not text.strip():
                print(f"Warning: No text extracted from {filename}.")
                failed_files.append(filename)
                continue  # Skip saving empty files
            
            # Normalize special characters
            text = normalize_text(text)
            
            # Save the extracted text to a file in the output folder
            output_file = os.path.join(output_folder, filename.replace(".pdf", ".txt"))
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(text)
            print(f"Saved processed data to {output_file}")
    
    # Log failed files
    if failed_files:
        print("\nThe following files failed to extract text:")
        for file in failed_files:
            print(f"- {file}")

# Function to classify a document using OpenAI's GPT-3.5-turbo
def classify_document(text, filename):
    """
    Classifies a document into one of the three categories using OpenAI's GPT-3.5-turbo model.
    """
    # Summarize the text to fit within the token limit
    summarized_text = summarize_text(text)
    
    prompt = f"""
    Classify the following document into one of these categories:
    1. Financial Report
    2. Presentation
    3. Press Release

    Document Text:
    {summarized_text}

    Instructions:
    - Focus on the content and structure of the document.
    - A Financial Report typically contains financial data, tables, and performance metrics.
    - A Presentation usually includes slides, bullet points, and visual elements.
    - A Press Release is a formal announcement for the media, often containing quotes and news.

    Return only the category name (Financial Report, Presentation, or Press Release).
    """
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant that classifies documents."},
            {"role": "user", "content": prompt}
        ],
        max_tokens=10,
        temperature=0
    )
    print(response)
    return response.choices[0].message['content'].strip()

# Function to summarize text for classification
def summarize_text(text, start_chars=1000, end_chars=500):
    """
    Extracts the first `start_chars` and the last `end_chars` from the text to create a summary.
    Also includes any section that contains keywords related to the categories.
    """
    if len(text) <= (start_chars + end_chars):
        return text  # Return the full text if it's already short enough
    
    # Look for keywords in the text
    keywords = {
        "Financial Report": ["financial", "revenue", "profit", "loss", "balance sheet"],
        "Presentation": ["slide", "agenda", "bullet points", "visuals", "summary"],
        "Press Release": ["announce", "media", "news", "quote", "statement"]
    }
    
    # Include sections with keywords
    summary = text[:start_chars]
    for category, words in keywords.items():
        for word in words:
            if word.lower() in text.lower():
                start = text.lower().find(word.lower())
                summary += text[start:start + 500]  # Add 500 characters around the keyword
                break
    
    summary += text[-end_chars:]
    return summary

# Function to classify all processed documents and save results
def classify_and_save_results(output_folder, results_file):
    """
    Classifies all processed documents and saves the results to a CSV file.
    """
    results = []
    for filename in os.listdir(output_folder):
        if filename.endswith(".txt"):
            file_path = os.path.join(output_folder, filename)
            with open(file_path, "r", encoding="utf-8") as f:
                text = f.read()
            category = classify_document(text, filename)
            results.append({"Filename": filename, "Category": category})
    
    # Save results to a CSV file
    df = pd.DataFrame(results)
    df.to_csv(results_file, index=False)
    print(f"Results saved to {results_file}")

# Main pipeline
if __name__ == "__main__":
    input_folder = r"C:\Users\sumit\Documents\EQL\InputData"
    output_folder = r"C:\Users\sumit\Documents\EQL\ProcessedData"
    results_file = r"C:\Users\sumit\Documents\EQL\classification_results.csv"

    # Specify languages (e.g., English, French, and Sweaden)
    languages = 'eng+fra+chi_sim+swa+swe'

    # Step 1: Process PDFs and extract text
    process_pdfs(input_folder, output_folder, languages=languages)

    # Step 2: Classify documents and save results
    classify_and_save_results(output_folder, results_file)

Processing financial_report.1.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\financial_report.1.txt
Processing financial_report.2.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\financial_report.2.txt
Processing financial_report.3.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\financial_report.3.txt
Processing financial_report.4.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\financial_report.4.txt
Processing financial_report.5.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\financial_report.5.txt
Processing presentation.1.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\presentation.1.txt
Processing presentation.2.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\presentation.2.txt
Processing presentation.3.pdf...
Saved processed data to C:\Users\sumit\Documents\EQL\ProcessedData\presentation.3.txt
Processi

{
  "id": "chatcmpl-AvUXH2iToVY1LWKSXgvKDQjG4oPmB",
  "object": "chat.completion",
  "created": 1738265439,
  "model": "gpt-3.5-turbo-0125",
  "choices": [
    {
      "index": 0,
      "message": {
        "role": "assistant",
        "content": "The document provided is classified as a Financial Report.",
        "refusal": null
      },
      "logprobs": null,
      "finish_reason": "length"
    }
  ],
  "usage": {
    "prompt_tokens": 742,
    "completion_tokens": 10,
    "total_tokens": 752,
    "prompt_tokens_details": {
      "cached_tokens": 0,
      "audio_tokens": 0
    },
    "completion_tokens_details": {
      "reasoning_tokens": 0,
      "audio_tokens": 0,
      "accepted_prediction_tokens": 0,
      "rejected_prediction_tokens": 0
    }
  },
  "service_tier": "default",
  "system_fingerprint": null
}
{
  "id": "chatcmpl-AvUXIuKtXZOo5Z2400O6otWASO96c",
  "object": "chat.completion",
  "created": 1738265440,
  "model": "gpt-3.5-turbo-0125",
  "choices": [
    {
      "inde