In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name to get a usable path.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Install the OCR / PDF dependencies. The %pip magic installs into the environment of the
# running kernel, instead of relying on IPython automagic to interpret a bare `pip` line.
%pip install pytesseract PyMuPDF opencv-python python-Levenshtein

In [None]:
import requests
import time
import random
import os

def safe_pdf_downloader(urls, save_dir="downloaded_pdfs", delay_range=(5, 15), proxies=None):
    """
    Politely download a list of PDFs, one at a time, into ``save_dir``.

    Each file is named after the last segment of its URL path (query string and
    fragment are ignored). Files that already exist in ``save_dir`` are skipped
    without contacting the server and without waiting. Before every network
    request except the first one, the function sleeps for a random interval
    drawn from ``delay_range``. Every request carries a randomly chosen desktop
    User-Agent header.

    A download is streamed to ``<name>.part`` and renamed to its final name only
    once it is complete, so an interrupted transfer never leaves a truncated PDF
    that a later run would skip as "existing".

    Args:
        urls (list): List of PDF URLs to download
        save_dir (str): Directory to save PDFs (default: 'downloaded_pdfs')
        delay_range (tuple): Min/max delay between requests in seconds (default: 5-15)
        proxies (dict): Optional proxies mapping, passed unchanged to every request

    Returns:
        None. Progress and failures are reported with ``print``. An HTTP 429
        response aborts all remaining downloads; any other failure is reported
        and the loop continues with the next URL.
    """
    # Function-scope import: only needed here, to isolate the path part of each URL.
    from urllib.parse import urlsplit

    # Common desktop User-Agents (expand this list)
    user_agents = [
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/121.0.0.0 Safari/537.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 14_3_1) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/17.2 Safari/605.1.1",
        "Mozilla/5.0 (Windows NT 10.0; rv:122.0) Gecko/20100101 Firefox/122.0",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.3"
    ]

    # Create save directory if not exists
    os.makedirs(save_dir, exist_ok=True)

    # Becomes True once the server has been contacted; the delay only makes
    # sense between two real requests, not before skipping a local file.
    request_made = False

    for i, url in enumerate(urls):
        part_path = None  # set while a partial download may exist on disk
        try:
            # Use only the URL path so "file.pdf?download=1" is saved as "file.pdf";
            # fall back to a generated name when the path ends with "/".
            filename = urlsplit(url).path.rsplit("/", 1)[-1] or f"download_{i + 1}.pdf"
            save_path = os.path.join(save_dir, filename)

            # Skip existing files (checked before sleeping: no request, no wait)
            if os.path.exists(save_path):
                print(f"Skipping existing file: {filename}")
                continue

            # Random delay between consecutive requests (never before the first one)
            if request_made:
                delay = random.uniform(*delay_range)
                print(f"Waiting {delay:.1f} seconds before next request...")
                time.sleep(delay)
            request_made = True

            # Pick a fresh User-Agent for every request
            headers = {
                "User-Agent": random.choice(user_agents),
                "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                "Accept-Language": "en-US,en;q=0.5",
            }

            print(f"Downloading ({i+1}/{len(urls)}): {filename}")

            part_path = save_path + ".part"

            # Stream download with timeout; the context manager releases the
            # connection even when the transfer fails part-way through.
            with requests.get(
                url,
                headers=headers,
                proxies=proxies,
                stream=True,
                timeout=20
            ) as response:
                # Check for 4xx/5xx errors
                response.raise_for_status()

                with open(part_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive chunks
                            f.write(chunk)

            # Publish the file under its final name only after a complete transfer
            os.replace(part_path, save_path)
            part_path = None

            print(f"Successfully saved: {save_path}")

        except requests.exceptions.HTTPError as e:
            status = e.response.status_code if e.response is not None else "unknown"
            if status == 429:
                print("Rate limited - consider increasing delays or using proxies")
                return  # Abort on rate limit
            print(f"HTTP Error {status} for {url}")
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {url}: {str(e)}")
        except Exception as e:
            print(f"Unexpected error: {str(e)}")
        finally:
            # Remove a leftover partial file so it is never mistaken for a finished PDF
            if part_path is not None and os.path.exists(part_path):
                try:
                    os.remove(part_path)
                except OSError:
                    pass

if __name__ == "__main__":
    # Example usage: fetch a single Resmi Gazete issue.
    pdf_urls = [
        "https://www.resmigazete.gov.tr/eskiler/2025/01/20250121-27.pdf",
        # Add more URLs here
    ]

    # More conservative delays than the defaults. To route traffic through a proxy,
    # additionally pass e.g. proxies={"http": "http://10.10.1.10:3128"}.
    safe_pdf_downloader(urls=pdf_urls, save_dir="resmi_gazete_pdfs", delay_range=(10, 30))

In [None]:
# Fetch the "tessdata_best" Turkish model into Tesseract's tessdata folder.
# -nc (no-clobber) skips the download when the file is already there, so re-running
# this cell does not pile up tur.traineddata.1, .2, ... copies beside the model.
!wget -nc -P /usr/share/tesseract-ocr/4.00/tessdata/ https://github.com/tesseract-ocr/tessdata_best/raw/main/tur.traineddata


In [None]:
import fitz  # PyMuPDF
import cv2
import numpy as np
import pytesseract
from pytesseract import Output
import re

# 1. INSTALL FIRST
# - Tesseract: https://github.com/UB-Mannheim/tesseract/wiki
# - Turkish language data: `tur.traineddata` in Tesseract's tessdata folder
# - Install packages: pip install pytesseract PyMuPDF opencv-python

def pdf_to_images(pdf_path):
    """Render every page of a PDF to an image array.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        list of uint8 numpy arrays, one per page in document order, each
        shaped (height, width, 3).
    """
    images = []

    # Open the document in a context manager so it is closed (and the file
    # released) even if rendering one of the pages raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=300)  # High DPI for better OCR
            # 3 channels: PyMuPDF renders RGB without alpha by default.
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, 3))
            images.append(img)

    return images

def preprocess_image(img):
    """Enhance a page image for better OCR results.

    Steps: grayscale conversion, non-local-means denoising (h=30), 2x bicubic
    upscaling, then Otsu binarisation.

    Args:
        img: Page image as a numpy array, either 3-channel RGB (as produced
            by pdf_to_images) or already single-channel grayscale.

    Returns:
        The binarised single-channel image, twice the input width and height.
    """
    if img.ndim == 2:
        # Already grayscale: cvtColor would reject a single-channel input.
        gray = img
    else:
        # Pages rendered by PyMuPDF are in RGB order, not OpenCV's BGR, so use
        # RGB2GRAY to apply the luminance weights to the correct channels.
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    denoised = cv2.fastNlMeansDenoising(gray, h=30)
    scaled = cv2.resize(denoised, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    # Threshold value 0 is ignored: THRESH_OTSU picks the threshold automatically.
    _, threshold = cv2.threshold(scaled, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
    return threshold

def ocr_turkish(image):
    """Run Tesseract on a preprocessed image with the Turkish language model.

    Args:
        image: The image to recognise.

    Returns:
        The result of pytesseract.image_to_data in dictionary form.
    """
    tesseract_flags = r'--oem 3 --psm 6 -l tur'
    return pytesseract.image_to_data(
        image,
        config=tesseract_flags,
        output_type=Output.DICT,
    )

def parse_legal_document(text):
    """Extract structural markers of a Turkish legal text with regular expressions.

    Args:
        text: The text to scan.

    Returns:
        dict mapping a category name ('date', 'law_number', 'section',
        'paragraph') to the list of all matches found for it, in order of
        appearance. Categories without any match are omitted. Matching is
        case-insensitive.
    """
    # Example patterns for Resmi Gazete
    category_patterns = {
        'date': r'\d{2}\.\d{2}\.\d{4}',
        'law_number': r'(?:Kanun|Yönetmelik)\s+N[oö]\.?\s*[\d-]+',
        'section': r'(?:Madde|MADDE)\s+\d+',
        'paragraph': r'\(\d+\)'
    }

    # Lazily pair every category with its matches, then keep the non-empty ones.
    hits_by_category = (
        (category, re.findall(regex, text, re.IGNORECASE))
        for category, regex in category_patterns.items()
    )
    return {category: hits for category, hits in hits_by_category if hits}

def process_scanned_pdf(pdf_path):
    """OCR every page of a scanned PDF and extract its legal-structure markers.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        list of dicts, one per page in order, with the keys 'page' (1-based
        page number), 'raw_text' (the non-blank OCR tokens joined by single
        spaces) and 'parsed' (the result of parse_legal_document on that text).
    """
    page_images = pdf_to_images(pdf_path)
    total_pages = len(page_images)
    pages = []

    for page_number, page_image in enumerate(page_images, start=1):
        print(f"Processing page {page_number}/{total_pages}")
        ocr_result = ocr_turkish(preprocess_image(page_image))

        # Drop blank tokens and join the rest into one line of text
        tokens = (token for token in ocr_result['text'] if token.strip())
        page_text = ' '.join(tokens)

        pages.append({
            'page': page_number,
            'raw_text': page_text,
            'parsed': parse_legal_document(page_text),
        })

    return pages

# Usage
result = process_scanned_pdf("/kaggle/input/yargtay/20250121-27.pdf")

# Preview per page: detected section headers, dates, and the first 500 characters of text
for page in result:
    print(f"\nPage {page['page']}:")
    print(f"Found sections: {page['parsed'].get('section', [])}")
    print(f"Found dates: {page['parsed'].get('date', [])}")
    print(f"Sample text: {page['raw_text'][:500]}...")

In [None]:
# Install Tesseract non-interactively: without -y the package manager stops at its
# confirmation prompt, which a notebook cell cannot answer. apt-get is used because
# it is the interface intended for scripted use.
!sudo apt-get update -qq
!sudo apt-get install -y tesseract-ocr
!sudo apt-get install -y libtesseract-dev
# Download the Turkish model and move it into Tesseract's tessdata folder
!wget https://github.com/tesseract-ocr/tessdata/raw/main/tur.traineddata
!mkdir -p /usr/share/tesseract-ocr/4.00/tessdata/
!mv tur.traineddata /usr/share/tesseract-ocr/4.00/tessdata/

import fitz
import cv2
import numpy as np
import pytesseract
from pytesseract import Output
import re
import os

# Set Tesseract path explicitly for Kaggle
pytesseract.pytesseract.tesseract_cmd = '/usr/bin/tesseract'
# Point TESSDATA_PREFIX at the folder that tur.traineddata was moved into earlier in this cell
os.environ['TESSDATA_PREFIX'] = '/usr/share/tesseract-ocr/4.00/tessdata/'

def pdf_to_images(pdf_path):
    """Render every page of a PDF to an image array.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.

    Returns:
        list of uint8 numpy arrays, one per page in document order, each
        shaped (height, width, 3).
    """
    images = []

    # Open the document in a context manager so it is closed (and the file
    # released) even if rendering one of the pages raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            pix = page.get_pixmap(dpi=300)  # 300 DPI rendering
            # 3 channels: PyMuPDF renders RGB without alpha by default.
            img = np.frombuffer(pix.samples, dtype=np.uint8).reshape((pix.height, pix.width, 3))
            images.append(img)

    return images

def preprocess_image(img):
    """Enhanced preprocessing for Kaggle environment.

    Steps: grayscale conversion, non-local-means denoising, CLAHE contrast
    enhancement, then a 3x3 sharpening filter.

    Args:
        img: Page image as a numpy array, either 3-channel RGB (as produced
            by pdf_to_images) or already single-channel grayscale.

    Returns:
        The sharpened single-channel image, same size as the input.
    """
    if img.ndim == 2:
        # Already grayscale: cvtColor would reject a single-channel input.
        gray = img
    else:
        # Pages rendered by PyMuPDF are in RGB order, not OpenCV's BGR, so use
        # RGB2GRAY to apply the luminance weights to the correct channels.
        gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # Improved denoising
    denoised = cv2.fastNlMeansDenoising(
        gray,
        h=30,
        templateWindowSize=7,
        searchWindowSize=21
    )

    # Contrast Limited Adaptive Histogram Equalization
    clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8,8))
    enhanced = clahe.apply(denoised)

    # Sharpening: the kernel weights sum to 1, so overall brightness is preserved
    kernel = np.array([[0, -1, 0], [-1, 5,-1], [0, -1, 0]])
    sharpened = cv2.filter2D(enhanced, -1, kernel)

    return sharpened

def ocr_turkish(image):
    """Run Turkish OCR on an image, degrading to an empty result on failure.

    Args:
        image: The image to recognise.

    Returns:
        The result of pytesseract.image_to_data in dictionary form. If the
        call raises, the error is printed and {'text': []} is returned
        instead, so callers can treat the page as containing no words.
    """
    tesseract_flags = r'--oem 3 --psm 6 -l tur --tessdata-dir /usr/share/tesseract-ocr/4.00/tessdata/'
    try:
        ocr_data = pytesseract.image_to_data(
            image,
            config=tesseract_flags,
            output_type=Output.DICT,
        )
    except Exception as err:
        print(f"OCR Error: {str(err)}")
        ocr_data = {'text': []}
    return ocr_data

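# parse_legal_document and process_scanned_pdf are reused unchanged from the earlier cell.
# process_scanned_pdf looks its helpers up by name at call time, so it now uses the
# pdf_to_images, preprocess_image and ocr_turkish versions redefined above.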

# Usage: run the OCR pipeline on the PDF again; `result` is rebound to the new per-page output
result = process_scanned_pdf("/kaggle/input/yargtay/20250121-27.pdf")

In [None]:
# Display the complete per-page output (dicts with 'page', 'raw_text' and 'parsed') as the cell result
result

In [None]:
def save_ocr_results(result, output_dir="ocr_results"):
    """Write the OCR output of every page to its own UTF-8 text file.

    Each file is named page_<n>.txt and contains a page header, the raw
    text, and one "KEY: value, value" line per parsed category.

    Args:
        result: Iterable of page dicts with the keys 'page', 'raw_text' and
            'parsed' (a dict mapping category name to a list of strings).
        output_dir: Directory to write into; created when missing.

    Returns:
        None. The path of each written file is printed.
    """
    os.makedirs(output_dir, exist_ok=True)

    for entry in result:
        target = os.path.join(output_dir, f"page_{entry['page']}.txt")

        with open(target, "w", encoding="utf-8") as out:
            print(f"=== Page {entry['page']} ===", file=out)
            print("RAW TEXT:", file=out)
            out.write(entry['raw_text'] + "\n\n")
            print("STRUCTURED DATA:", file=out)
            for label, matches in entry['parsed'].items():
                print(f"{label.upper()}: {', '.join(matches)}", file=out)

        print(f"Saved: {target}")

# Usage: write one page_<n>.txt per page into the default "ocr_results" directory
save_ocr_results(result)

In [None]:
# Install the PDF writer. The %pip magic installs into the environment of the running
# kernel, instead of relying on IPython automagic to interpret a bare `pip` line.
%pip install FPDF

In [None]:
from fpdf import FPDF

def create_searchable_pdf(result, output_path="searchable.pdf"):
    """Write the OCR text of every page into a new, text-only PDF.

    One PDF page is started per entry of ``result``, beginning with a
    "Sayfa <n>" header line followed by the page's raw text. Long text flows
    onto further pages through automatic page breaks.

    Args:
        result: Iterable of page dicts with the keys 'page' and 'raw_text'.
        output_path: Destination file of the generated PDF.

    Returns:
        None. The output path is printed once the file has been written.
    """
    doc = FPDF()
    doc.set_auto_page_break(auto=True, margin=15)

    # Register a Unicode TrueType font so Turkish characters can be rendered
    # (the font file must exist at this path).
    doc.add_font("DejaVu", "", "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf", uni=True)
    doc.set_font("DejaVu", size=12)

    for entry in result:
        doc.add_page()
        # Page number header
        doc.cell(0, 10, f"Sayfa {entry['page']}", ln=1)

        body_text = entry['raw_text']
        try:
            doc.multi_cell(0, 10, txt=body_text)
        except UnicodeEncodeError:
            # Fallback: replace every character Latin-1 cannot represent with '?'
            latin1_text = body_text.encode('latin-1', 'replace').decode('latin-1')
            doc.multi_cell(0, 10, txt=latin1_text)

    doc.output(output_path)
    print(f"Oluşturulan PDF: {output_path}")

# Install the DejaVu fonts first, so that the DejaVuSans.ttf path registered by
# create_searchable_pdf exists when the function is called below:
!apt-get update -qq && apt-get install -y fonts-dejavu

# Then build the PDF from the OCR result (written to the default "searchable.pdf")
create_searchable_pdf(result)

In [None]:
import os

# Confirm that the PDF was written and report its size
if not os.path.exists("searchable.pdf"):
    print("File not created!")
else:
    print("File exists. Size:", os.path.getsize("searchable.pdf"), "bytes")

# Show everything in the working directory
print("\nFiles in current folder:")
print(os.listdir("."))

In [None]:
from IPython.display import FileLink

# Show a clickable link to the generated PDF as the cell output, so it can be downloaded from the notebook
FileLink("searchable.pdf")