In [4]:
import pytesseract
from PIL import Image
import os
import re

try:
    import pdf2image
    PDF_SUPPORT = True
except ImportError:
    PDF_SUPPORT = False
    print("Warning: pdf2image not installed. PDF support is disabled.")

# Set Tesseract executable path
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe'

# Define fields to extract
FIELDS = {
    "Name": None,
    "Email": None,
    "Phone": None,
    "Address": None
}

# Function to extract text from an image
def extract_text_from_image(image):
    try:
        text = pytesseract.image_to_string(image, lang='eng')
        return text if text.strip() else "No text detected."
    except Exception as e:
        return f"Error extracting text: {e}"

# Function to process a file (image or PDF)
def process_file(file_path):
    if not os.path.exists(file_path):
        return f"Error: File '{file_path}' does not exist. Please check the path."

    file_extension = os.path.splitext(file_path)[1].lower()
    extracted_text = ""

    # Process image files
    if file_extension in ['.jpg', '.jpeg', '.png', '.bmp', '.tiff']:
        print(f"Processing image: {file_path}")
        try:
            image = Image.open(file_path)
            extracted_text = extract_text_from_image(image)
        except Exception as e:
            return f"Error opening image: {e}"

    # Process PDF files
    elif file_extension == '.pdf' and PDF_SUPPORT:
        print(f"Processing PDF: {file_path}")
        try:
            poppler_path = r'D:\poppler\Library\bin'  # Verify this matches your Poppler location
            images = pdf2image.convert_from_path(file_path, poppler_path=poppler_path)
            for i, image in enumerate(images):
                text = extract_text_from_image(image)
                extracted_text += f"Page {i+1}:\n{text}\n{'-'*50}\n"
        except Exception as e:
            return f"Error processing PDF: {e}"
    elif file_extension == '.pdf' and not PDF_SUPPORT:
        return "Error: PDF support requires pdf2image and Poppler. Please install them."
    else:
        return f"Error: Unsupported file type '{file_extension}'. Supported types: .jpg, .png, .pdf, etc."

    return extracted_text

# Function to map extracted text to fields
def map_text_to_fields(text):
    lines = text.splitlines()
    fields = FIELDS.copy()

    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    phone_pattern = r'(\+?\d{1,3}[-.\s]?)?\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}'

    for line in lines:
        line = line.strip()
        if not line:
            continue

        if "Name" in line and not fields["Name"]:
            fields["Name"] = line.replace("Name", "").strip()
        elif not fields["Name"] and len(line.split()) <= 3:
            fields["Name"] = line

        email_match = re.search(email_pattern, line)
        if email_match and not fields["Email"]:
            fields["Email"] = email_match.group(0)

        phone_match = re.search(phone_pattern, line)
        if phone_match and not fields["Phone"]:
            fields["Phone"] = phone_match.group(0)

        if ("Address" in line or (len(line.split()) > 5 and any(c.isdigit() for c in line))) and not fields["Address"]:
            fields["Address"] = line.replace("Address", "").strip()

    return fields

# Main program
def main():
    print("=== OCR Text Extraction Program with Field Mapping ===")
    print("Supported file types: Images (.jpg, .png, etc.), PDFs (if pdf2image is installed)")
    print("Fields to extract: Name, Email, Phone, Address")

    files = []
    while True:
        file_path = input("Enter the full path to your file (e.g., D:\\Resume.jpg) or 'done' to finish: ").strip()
        if file_path.lower() == 'done':
            break
        files.append(file_path)

    if not files:
        print("No files provided. Exiting.")
        return

    all_results = []
    for file_path in files:
        print(f"\nProcessing: {file_path}")
        result = process_file(file_path)
        
        if "Error" in result:
            print(result)
            continue
        
        mapped_fields = map_text_to_fields(result)
        all_results.append((file_path, result, mapped_fields))

    output_dir = "output"
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "extracted_text_with_fields.txt")

    with open(output_file, "w", encoding="utf-8") as f:
        for file_path, raw_text, fields in all_results:
            print(f"\nResults for: {file_path}")
            print("Extracted Text:")
            print(raw_text)
            print("\nMapped Fields:")
            for field, value in fields.items():
                print(f"{field}: {value or 'Not found'}")
            
            f.write(f"\n{'='*50}\n")
            f.write(f"File: {file_path}\n")
            f.write("Extracted Text:\n")
            f.write(raw_text + "\n")
            f.write("Mapped Fields:\n")
            for field, value in fields.items():
                f.write(f"{field}: {value or 'Not found'}\n")

    print(f"\nAll results saved to: {output_file}")

if __name__ == "__main__":
    main()

=== OCR Text Extraction Program with Field Mapping ===
Supported file types: Images (.jpg, .png, etc.), PDFs (if pdf2image is installed)
Fields to extract: Name, Email, Phone, Address


Enter the full path to your file (e.g., D:\Resume.jpg) or 'done' to finish:  D:\Resume.jpg
Enter the full path to your file (e.g., D:\Resume.jpg) or 'done' to finish:  done



Processing: D:\Resume.jpg
Processing image: D:\Resume.jpg

Results for: D:\Resume.jpg
Extracted Text:
CURRICULUM VITAE

Name Goes Here
Address

Email

Phone

PERSONAL DETAILS

Date OF Birth
Sex

Marital Status
Nationality
Language Known

CAREER OBJECTIVE

of my talents & skills to hecome its strong asset.

EDUCATION QUALIFICATION

Strength

Punctuality
Hard Working
Ability to learn new th
Willing to do any

Hobbies

Listening Music
‘+ Playing Outdoor Games

{declare thatthe above information is true tothe best of my knowledge.

‘To be apart ofan organization where I ean devote myself fully and joyfully give oot



Mapped Fields:
Name: CURRICULUM VITAE
Email: Not found
Phone: Not found
Address: Not found

All results saved to: output\extracted_text_with_fields.txt
