In [1]:
print("🔧 CELL 1: Importing libraries...")
import os
import time
import cv2

print("✅ Basic imports successful")
print(f"📁 Current directory: {os.getcwd()}")
print(f"📄 Files in directory: {[f for f in os.listdir('.') if f.endswith(('.jpg', '.png', '.pdf'))]}")


🔧 CELL 1: Importing libraries...
✅ Basic imports successful
📁 Current directory: c:\Users\alila\Desktop\PaddleOCR
📄 Files in directory: ['temp_page.png', 'test_small_resume.png']


In [3]:
print("🔧 CELL 2: Testing image loading...")

image_path = "pdfs/Ali_Lazraq.jpg"

# Check if file exists
if os.path.exists(image_path):
    print(f"✅ File found: {image_path}")
    
    # Get file size
    file_size = os.path.getsize(image_path) / (1024 * 1024)  # MB
    print(f"📊 File size: {file_size:.2f} MB")
    
    # Try to load with OpenCV
    img = cv2.imread(image_path)
    if img is not None:
        print(f"✅ Image loaded successfully: {img.shape}")
        print(f"📐 Image dimensions: {img.shape[1]} x {img.shape[0]} pixels")
        print(f"📊 Image channels: {img.shape[2]}")
        
        # Check if image is too large
        total_pixels = img.shape[0] * img.shape[1]
        print(f"📊 Total pixels: {total_pixels:,}")
        
        if total_pixels > 2000000:  # 2 million pixels
            print("⚠️ WARNING: Large image detected - this might cause memory issues")
        
    else:
        print("❌ Failed to load image with OpenCV")
else:
    print(f"❌ File not found: {image_path}")
    print("Please update the image_path variable")

🔧 CELL 2: Testing image loading...
✅ File found: pdfs/Ali_Lazraq.jpg
📊 File size: 0.86 MB
✅ Image loaded successfully: (1650, 1275, 3)
📐 Image dimensions: 1275 x 1650 pixels
📊 Image channels: 3
📊 Total pixels: 2,103,750


In [4]:
print("🔧 CELL 3: Testing PaddleOCR import...")

try:
    from paddleocr import PaddleOCR
    print("✅ PaddleOCR imported successfully")
    
    # Check if we can access the class
    print(f"📦 PaddleOCR class: {PaddleOCR}")
    
except ImportError as e:
    print(f"❌ Failed to import PaddleOCR: {e}")
except Exception as e:
    print(f"❌ Unexpected error importing PaddleOCR: {e}")

🔧 CELL 3: Testing PaddleOCR import...
✅ PaddleOCR imported successfully
📦 PaddleOCR class: <class 'paddleocr._pipelines.ocr.PaddleOCR'>


In [None]:
print("🔧 CELL 4: Creating small test image...")

try:
    import numpy as np
    from PIL import Image, ImageDraw, ImageFont
    
    # Create a small test image with text
    test_img = Image.new('RGB', (400, 200), color='white')
    draw = ImageDraw.Draw(test_img)
    
    # Try to use a default font, fallback to basic if not available
    try:
        font = ImageFont.load_default()
    except:
        font = None
    
    # Draw some text
    draw.text((50, 50), "JOHN DOE", fill='black', font=font)
    draw.text((50, 80), "Software Engineer", fill='black', font=font)
    draw.text((50, 110), "Email: john@example.com", fill='black', font=font)
    draw.text((50, 140), "Phone: (555) 123-4567", fill='black', font=font)
    
    # Save test image
    test_image_path = "test_small_resume.png"
    test_img.save(test_image_path)
    print(f"✅ Created small test image: {test_image_path}")
    
    # Verify it loads with OpenCV
    test_cv_img = cv2.imread(test_image_path)
    if test_cv_img is not None:
        print(f"✅ Test image loads with OpenCV: {test_cv_img.shape}")
    else:
        print("❌ Test image failed to load with OpenCV")
        
except Exception as e:
    print(f"❌ Failed to create test image: {e}")

🔧 CELL 4: Creating small test image...
✅ Created small test image: test_small_resume.png
✅ Test image loads with OpenCV: (200, 400, 3)


In [6]:
print("🔧 CELL 5: Initializing PaddleOCR...")
print("⏳ This might take a few minutes for first-time setup...")

start_time = time.time()

try:
    # Create with minimal parameters
    ocr = PaddleOCR(lang='en')
    
    init_time = time.time() - start_time
    print(f"✅ PaddleOCR initialized successfully in {init_time:.2f} seconds")
    
    # Check what models were loaded
    print(f"📦 OCR object created: {type(ocr)}")
    
except Exception as e:
    print(f"❌ Failed to initialize PaddleOCR: {e}")
    import traceback
    traceback.print_exc()

🔧 CELL 5: Initializing PaddleOCR...
⏳ This might take a few minutes for first-time setup...


[32mCreating model: ('PP-LCNet_x1_0_doc_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_doc_ori), the model files will be automatically downloaded and saved in C:\Users\alila\.paddlex\official_models.[0m
[32mCreating model: ('UVDoc', None)[0m
[32mUsing official model (UVDoc), the model files will be automatically downloaded and saved in C:\Users\alila\.paddlex\official_models.[0m
[32mCreating model: ('PP-LCNet_x1_0_textline_ori', None)[0m
[32mUsing official model (PP-LCNet_x1_0_textline_ori), the model files will be automatically downloaded and saved in C:\Users\alila\.paddlex\official_models.[0m
[32mCreating model: ('PP-OCRv5_server_det', None)[0m
[32mUsing official model (PP-OCRv5_server_det), the model files will be automatically downloaded and saved in C:\Users\alila\.paddlex\official_models.[0m
[32mCreating model: ('PP-OCRv5_server_rec', None)[0m
[32mUsing official model (PP-OCRv5_server_rec), the model files will be automatically downloaded and saved in 

✅ PaddleOCR initialized successfully in 3.72 seconds
📦 OCR object created: <class 'paddleocr._pipelines.ocr.PaddleOCR'>


In [7]:
print("🔧 CELL 6: Testing OCR on small test image...")

if 'ocr' in locals() and 'test_image_path' in locals():
    try:
        print("⏳ Processing small test image...")
        start_time = time.time()
        
        result = ocr.ocr(test_image_path)
        
        process_time = time.time() - start_time
        print(f"✅ Small image processed in {process_time:.2f} seconds")
        print(f"📊 Result type: {type(result)}")
        print(f"📊 Result content: {result}")
        
        # Extract text
        if result and result[0]:
            texts = []
            for detection in result[0]:
                if detection and len(detection) >= 2:
                    text = detection[1][0] if isinstance(detection[1], (list, tuple)) else str(detection[1])
                    texts.append(text)
            
            print(f"✅ Extracted {len(texts)} text elements:")
            for i, text in enumerate(texts):
                print(f"   {i+1}. {text}")
        else:
            print("❌ No text extracted from small image")
            
    except Exception as e:
        print(f"❌ Failed to process small image: {e}")
        import traceback
        traceback.print_exc()
else:
    print("⚠️ OCR not initialized or test image not created")

🔧 CELL 6: Testing OCR on small test image...
⏳ Processing small test image...


  result = ocr.ocr(test_image_path)


✅ Small image processed in 15.29 seconds
📊 Result type: <class 'list'>
📊 Result content: [{'input_path': 'test_small_resume.png', 'page_index': None, 'doc_preprocessor_res': {'input_path': None, 'page_index': None, 'input_img': array([[[255, ..., 255],
        ...,
        [255, ..., 255]],

       ...,

       [[255, ..., 255],
        ...,
        [255, ..., 255]]], dtype=uint8), 'model_settings': {'use_doc_orientation_classify': True, 'use_doc_unwarping': True}, 'angle': 0, 'rot_img': array([[[255, ..., 255],
        ...,
        [255, ..., 255]],

       ...,

       [[255, ..., 255],
        ...,
        [255, ..., 255]]], dtype=uint8), 'output_img': array([[[255, ..., 255],
        ...,
        [255, ..., 255]],

       ...,

       [[255, ..., 255],
        ...,
        [255, ..., 255]]], dtype=uint8)}, 'dt_polys': [array([[31, 50],
       ...,
       [31, 65]], dtype=int16), array([[34, 83],
       ...,
       [34, 98]], dtype=int16), array([[ 34, 118],
       ...,
       [ 34,

In [8]:
print("🔧 CELL 7: Testing OCR on resized original image...")

if 'ocr' in locals() and 'img' in locals():
    try:
        # Resize image to make it smaller and faster
        height, width = img.shape[:2]
        
        # Calculate new size (max 800 pixels on longest side)
        max_dimension = 800
        if max(height, width) > max_dimension:
            scale = max_dimension / max(height, width)
            new_width = int(width * scale)
            new_height = int(height * scale)
            
            resized_img = cv2.resize(img, (new_width, new_height))
            print(f"📐 Resized image from {width}x{height} to {new_width}x{new_height}")
        else:
            resized_img = img
            print(f"📐 Image size OK, no resizing needed: {width}x{height}")
        
        # Save resized image
        resized_path = "resized_resume.jpg"
        cv2.imwrite(resized_path, resized_img)
        print(f"💾 Saved resized image: {resized_path}")
        
        # Process with OCR
        print("⏳ Processing resized original image...")
        start_time = time.time()
        
        result = ocr.ocr(resized_path)
        
        process_time = time.time() - start_time
        print(f"✅ Resized image processed in {process_time:.2f} seconds")
        
        # Extract and display text
        if result and result[0]:
            texts = []
            for detection in result[0]:
                if detection and len(detection) >= 2:
                    text_info = detection[1]
                    if isinstance(text_info, (list, tuple)) and len(text_info) >= 1:
                        text = text_info[0]
                        confidence = text_info[1] if len(text_info) >= 2 else 1.0
                        
                        if text and text.strip():
                            texts.append((text.strip(), confidence))
            
            print(f"✅ Extracted {len(texts)} text elements from original image:")
            for i, (text, conf) in enumerate(texts[:10]):  # Show first 10
                print(f"   {i+1}. '{text}' (conf: {conf:.3f})")
            
            if len(texts) > 10:
                print(f"   ... and {len(texts) - 10} more")
            
            # Save results
            with open("extracted_text_debug.txt", "w", encoding="utf-8") as f:
                f.write("PaddleOCR Debug Results\n")
                f.write("=" * 30 + "\n")
                for text, conf in texts:
                    f.write(f"{text} (confidence: {conf:.3f})\n")
            
            print(f"💾 Results saved to: extracted_text_debug.txt")
            
        else:
            print("❌ No text extracted from resized image")
            
    except Exception as e:
        print(f"❌ Failed to process resized image: {e}")
        import traceback
        traceback.print_exc()
else:
    print("⚠️ OCR not initialized or original image not loaded")

🔧 CELL 7: Testing OCR on resized original image...
📐 Resized image from 1275x1650 to 618x800
💾 Saved resized image: resized_resume.jpg
⏳ Processing resized original image...


  result = ocr.ocr(resized_path)


✅ Resized image processed in 312.83 seconds
✅ Extracted 0 text elements from original image:
💾 Results saved to: extracted_text_debug.txt


In [9]:
print("🔧 CELL 8: Cleanup...")

try:
    # Clean up variables
    if 'ocr' in locals():
        del ocr
        print("✅ OCR object deleted")
    
    if 'img' in locals():
        del img
        print("✅ Image variables deleted")
    
    # Clean up temporary files
    temp_files = ["test_small_resume.png", "resized_resume.jpg"]
    for temp_file in temp_files:
        if os.path.exists(temp_file):
            os.remove(temp_file)
            print(f"🗑️ Deleted: {temp_file}")
    
    # Force garbage collection
    import gc
    gc.collect()
    print("✅ Garbage collection completed")
    
except Exception as e:
    print(f"⚠️ Cleanup warning: {e}")

print("\n" + "="*50)
print("🎯 DEBUGGING COMPLETE")
print("="*50)
print("Run cells one by one to identify where the problem occurs!")
print("If a cell hangs, interrupt it and skip to the next one.")

🔧 CELL 8: Cleanup...
✅ OCR object deleted
✅ Image variables deleted
🗑️ Deleted: test_small_resume.png
🗑️ Deleted: resized_resume.jpg
✅ Garbage collection completed

🎯 DEBUGGING COMPLETE
Run cells one by one to identify where the problem occurs!
If a cell hangs, interrupt it and skip to the next one.
