# PaddleOCR

In [None]:
import gc
import pandas as pd
import os
from paddleocr import PaddleOCR
import time

# Initialize PaddleOCR
# One shared engine instance is created here and reused for every image.
# The flags request GPU inference (use_gpu), the text-angle classifier
# (use_angle_cls) and the English model (lang='en').
ocr = PaddleOCR(use_gpu=True, use_angle_cls=True, lang='en')

# Load the CSV file
# NOTE(review): the batch loop further down reads an 'image location' column
# from this frame, so the CSV is expected to provide it — confirm against the data.
csv_file = r"C:\Users\ariad\OneDrive\Desktop\Projects\Entity_extraction\Data\test.csv"
data = pd.read_csv(csv_file)

batch_size = 25  # number of CSV rows handled per batch
start_batch_number = 0  # index of the first batch to process
end_batch_number = 5200  # exclusive upper bound on the batch index (loop tests `<`)
total_rows = len(data)  # the loop also stops once this many rows are consumed

# Function to perform OCR on image paths with error handling
def extract_text_from_image(image_path):
    """Run OCR on a single image file and return its text as one string.

    The recognized fragments are joined with single spaces, in the order the
    OCR engine reports them.

    Returns:
        The recognized text, or one of three sentinel strings:
        'Image not found' when ``image_path`` does not exist on disk,
        'No text found' when OCR produced no text, and
        'Error processing image' when any exception was raised (the
        exception is printed, not propagated).
    """
    try:
        if os.path.exists(image_path):
            ocr_output = ocr.ocr(image_path, cls=True)

            # Anything that is not a list (e.g. None) contributes no text;
            # the same applies to non-list entries inside the result.
            pages = ocr_output if isinstance(ocr_output, list) else []
            fragments = [
                detection[1][0] + ' '
                for page in pages
                if isinstance(page, list)
                for detection in page
            ]
            return ''.join(fragments).strip() or 'No text found'

        return 'Image not found'

    except Exception as err:
        # Best-effort: one bad image must not abort the surrounding batch.
        print(f"Error processing image {image_path}: {str(err)}")
        return 'Error processing image'

# Process batches start_batch_number .. end_batch_number - 1, stopping early
# once every row of `data` has been consumed. Each batch is OCR'd and written
# to its own CSV file; a failing batch is logged and skipped.
batch_number = start_batch_number
start_row = batch_number * batch_size

while batch_number < end_batch_number and start_row < total_rows:
    end_row = min(start_row + batch_size, total_rows)
    # Copy the slice so the new column below is added to an independent frame
    # rather than to a slice of `data` (avoids pandas' SettingWithCopyWarning).
    batch = data.iloc[start_row:end_row].copy()

    try:
        # Perform OCR on every image path in the batch
        batch['extracted_text'] = batch['image location'].apply(extract_text_from_image)

        # Save intermediate results to CSV.
        # Raw f-string: the backslashes in the Windows path are literal, not
        # escape sequences (the runtime path is unchanged).
        output_file = rf'D:\project\Data\OCR_test\ocr_extracted_data_batch_{batch_number}.csv'
        # mode='a' appends when the file already exists; the header is only
        # written when the file is being created.
        batch.to_csv(output_file, index=False, mode='a', header=not os.path.exists(output_file))

        print(f"Batch {batch_number} processed and saved to {output_file}.")

    except Exception as e:
        # Log batch-level errors and continue with the next batch
        print(f"Error processing batch {batch_number}: {e}")
        with open('error_log.txt', 'a') as log_file:
            log_file.write(f"Batch {batch_number} failed: {e}\n")

    # Free memory held by this batch before starting the next one
    del batch
    gc.collect()
    time.sleep(0.5)  # brief pause between batches
    # Update for the next batch
    batch_number += 1
    start_row += batch_size