In [191]:
import io
import os
import shutil

import fitz  # PyMuPDF
from PIL import Image

# Path of the PDF whose pages are rendered here and OCR'd in a later cell.
pdf_path = 'cv.pdf'

# Resolution used when rasterising pages for OCR. PDF user space is 72 units
# per inch, so an unscaled render is 72 DPI, which is too coarse for reliable
# text recognition; Tesseract's guidance is roughly 300 DPI.
OCR_RENDER_DPI = 300
render_zoom = OCR_RENDER_DPI / 72
render_matrix = fitz.Matrix(render_zoom, render_zoom)

# Open the PDF. The handle is intentionally left open: a later cell loads
# pages from `doc` again to inspect their images and vector drawings.
doc = fitz.open(pdf_path)

# One PIL image per page, in page order.
pdf_images = []

# Render every page to a raster image.
for page_num in range(len(doc)):
    page = doc.load_page(page_num)  # Load the current page
    pix = page.get_pixmap(matrix=render_matrix)  # Rasterise at OCR_RENDER_DPI
    # Encode the pixmap as PNG bytes and let PIL decode them.
    img = Image.open(io.BytesIO(pix.tobytes("png")))
    pdf_images.append(img)


In [192]:
# Load the standalone image file.
# `image` is the object handed to the OCR helper in a later cell.
image_path = 'image.png'
image = Image.open(image_path)


In [193]:
import pytesseract

# Locate the Tesseract executable without requiring a code edit per machine:
#   1. the TESSERACT_CMD environment variable, if set;
#   2. a `tesseract` binary found on PATH;
#   3. the default Windows install location as a last resort.
pytesseract.pytesseract.tesseract_cmd = (
    os.environ.get("TESSERACT_CMD")
    or shutil.which("tesseract")
    or "C:/Program Files/Tesseract-OCR/tesseract.exe"
)

# Function to apply OCR on an image
def extract_text_from_image(img, lang='eng'):
    """Run Tesseract OCR on an image and return the recognised text.

    Args:
        img: Image to recognise; forwarded unchanged to
            ``pytesseract.image_to_string`` (the notebook passes PIL images).
        lang: Tesseract language code(s) forwarded to ``image_to_string``.
            Defaults to ``'eng'``, the value that used to be hard-coded, so
            existing single-argument calls behave as before.

    Returns:
        Whatever ``pytesseract.image_to_string`` returns for the image.
    """
    return pytesseract.image_to_string(img, lang=lang)

# OCR the standalone image
image_text = extract_text_from_image(image)

# OCR every rendered PDF page, preserving page order
pdf_texts = list(map(extract_text_from_image, pdf_images))


In [194]:
# Function to analyze visual components of each PDF page
def analyze_visual_components(page):
    """Collect the embedded images and vector drawings reported by a page.

    Args:
        page: Object exposing ``get_images(full=True)`` and ``get_drawings()``
            (the notebook passes pages loaded from the opened PDF).

    Returns:
        dict with the keys ``'images-length'``, ``'drawings-length'``,
        ``'images'`` and ``'drawings'``, inserted in that order.
    """
    embedded_images = page.get_images(full=True)
    vector_drawings = page.get_drawings()  # lines, rectangles, etc.

    # Deliberately simple summary: counts plus the raw lists.
    visuals = {}
    visuals['images-length'] = len(embedded_images)
    visuals['drawings-length'] = len(vector_drawings)
    visuals['images'] = embedded_images
    visuals['drawings'] = vector_drawings
    return visuals

# Run the visual analysis on every page of the PDF, in page order
page_indices = range(len(doc))
pdf_visuals = [
    analyze_visual_components(doc.load_page(page_index))
    for page_index in page_indices
]


In [195]:
# Function to print and understand drawings from a PDF page
def print_drawings(drawings):
    """Print every drawing dict as an indented, human-readable listing.

    Each drawing gets a ``Drawing N:`` header (1-based) followed by one line
    per key. A value that is a list made up solely of lists/tuples is expanded
    with one entry per line between ``[`` and ``]``; any other value is printed
    inline. A blank line follows each drawing.

    Args:
        drawings: Iterable of dict-like drawing records.
    """
    for number, drawing in enumerate(drawings, start=1):
        print(f"Drawing {number}:")
        for key, value in drawing.items():
            is_list_of_sequences = isinstance(value, list) and all(
                isinstance(entry, (list, tuple)) for entry in value
            )
            if not is_list_of_sequences:
                print(f"  {key}: {value}")
                continue
            # Nested structure (e.g. the path items): one entry per line.
            print(f"  {key}: [")
            for entry in value:
                print(f"    {entry},")
            print("  ]")
        print("")  # blank separator line between drawings

# Example usage:
# Print a "Page N:" header for each entry of pdf_visuals, followed by that page's drawings
for i, visuals in enumerate(pdf_visuals):
    page_header = f"Page {i + 1}:"
    print(page_header)
    print_drawings(visuals['drawings'])


Page 1:
Drawing 1:
  items: [
    ('re', Rect(0.0, 0.0, 187.34255981445312, 841.542724609375), -1),
  ]
  type: f
  even_odd: False
  fill_opacity: 1.0
  fill: (0.27059999108314514, 0.34119999408721924, 0.40779998898506165)
  rect: Rect(0.0, 0.0, 187.34255981445312, 841.542724609375)
  seqno: 0
  layer: 
  closePath: None
  color: None
  width: None
  lineCap: None
  lineJoin: None
  dashes: None
  stroke_opacity: None

Drawing 2:
  items: [
    ('re', Rect(0.0, 0.0, 187.34255981445312, 100.41561126708984), -1),
  ]
  type: f
  even_odd: False
  fill_opacity: 1.0
  fill: (0.8549000024795532, 0.6470999717712402, 0.0)
  rect: Rect(0.0, 0.0, 187.34255981445312, 100.41561126708984)
  seqno: 22
  layer: 
  closePath: None
  color: None
  width: None
  lineCap: None
  lineJoin: None
  dashes: None
  stroke_opacity: None

Drawing 3:
  items: [
    ('l', Point(29.974807739257812, 343.2115478515625), Point(29.974807739257812, 342.4621887207031)),
    ('c', Point(29.974807739257812, 342.46218872

In [196]:
# Example function to clean OCR text
def clean_text(raw_text):
    """Return ``raw_text`` with leading and trailing whitespace removed.

    This is a placeholder cleaning step: only ``str.strip()`` is applied.
    Further cleaning (e.g. fixing OCR mistakes) would be added here.
    """
    return raw_text.strip()

# Apply the cleaning step to the per-page OCR text and to the standalone image's text
cleaned_pdf_texts = list(map(clean_text, pdf_texts))
cleaned_image_text = clean_text(image_text)


In [197]:
def convert_point(point):
    """Return the ``x`` and ``y`` attributes of ``point`` as an ``(x, y)`` tuple."""
    x_coord, y_coord = point.x, point.y
    return x_coord, y_coord

def convert_shape(item):
    """Convert one drawing item into a tuple of JSON-serialisable values.

    ``item`` is a sequence whose first element is the action code and whose
    remaining elements are its arguments. Each argument is converted as:

    * ``fitz.Rect``  -> ``(x0, y0, x1, y1)``
    * ``fitz.Point`` -> ``(x, y)``
    * ``fitz.Quad``  -> four ``(x, y)`` tuples in the order ul, ur, ll, lr
    * anything else is passed through unchanged.

    Quads used to be passed through untouched, which made the later
    ``json.dump`` fail for any drawing containing a ``'qu'`` item.

    Returns:
        ``(action, *converted_args)`` as a tuple.
    """
    def _as_plain(arg):
        # Reduce one PyMuPDF geometry object to nested tuples of numbers.
        if isinstance(arg, fitz.Rect):
            return (arg.x0, arg.y0, arg.x1, arg.y1)
        if isinstance(arg, fitz.Point):
            return (arg.x, arg.y)
        if isinstance(arg, fitz.Quad):
            return tuple((corner.x, corner.y)
                         for corner in (arg.ul, arg.ur, arg.ll, arg.lr))
        return arg

    action, *args = item
    return (action, *(_as_plain(arg) for arg in args))

def convert_drawings(drawings):
    """Return copies of the drawing dicts with geometry objects made plain.

    For every drawing, top-level ``fitz.Rect`` values become
    ``(x0, y0, x1, y1)`` tuples and ``fitz.Point`` values become ``(x, y)``
    tuples; all other values are kept as they are. The ``'items'`` entry is
    then replaced by the list of ``convert_shape`` results (an empty list
    when the drawing has no ``'items'`` key).

    Args:
        drawings: Iterable of drawing dicts.

    Returns:
        A new list with one converted dict per input drawing.
    """
    serializable_drawings = []
    for drawing in drawings:
        plain = {}
        for key, value in drawing.items():
            if isinstance(value, fitz.Rect):
                plain[key] = (value.x0, value.y0, value.x1, value.y1)
            elif isinstance(value, fitz.Point):
                plain[key] = (value.x, value.y)
            else:
                plain[key] = value

        # The path items need per-element conversion.
        plain['items'] = list(map(convert_shape, drawing.get('items', [])))
        serializable_drawings.append(plain)
    return serializable_drawings

# Replace each page's drawings in pdf_visuals with their converted form
for page_visual in pdf_visuals:
    page_visual.update(drawings=convert_drawings(page_visual['drawings']))



In [198]:
import json

# Bundle the cleaned OCR text and the per-page visuals into one dictionary
extracted_data = dict(
    image_text=cleaned_image_text,
    pdf_texts=cleaned_pdf_texts,
    pdf_visuals=pdf_visuals,
)

# Write the bundle to disk as indented JSON
with open('extracted_data_raw.json', 'w') as json_file:
    json.dump(extracted_data, json_file, indent=4)


In [199]:
import json

def _describe_drawing_item(item):
    """Map one serialised drawing item to a ``(type, coordinates)`` pair.

    ``item[0]`` is the action code and the remaining elements are its
    arguments in serialised form: a rectangle as ``[x0, y0, x1, y1]`` and
    points as ``[x, y]``. Unrecognised codes yield ``('Unknown', [])``.
    """
    code = item[0]
    if code == 're':
        # The rectangle is stored by its two corners, so width and height
        # have to be derived; they are not the last two stored values.
        x0, y0, x1, y1 = item[1]
        return 'Rectangle', {'x': x0, 'y': y0, 'width': x1 - x0, 'height': y1 - y0}
    if code == 'l':
        start, end = item[1], item[2]
        return 'Line', {
            'start': {'x': start[0], 'y': start[1]},
            'end': {'x': end[0], 'y': end[1]},
        }
    if code == 'c':
        return 'Curve', {'points': item[1:]}
    if code == 'qu':
        return 'Quad', {'points': item[1]}
    return 'Unknown', []


# Function to convert complex JSON drawing details into a more readable format
def convert_to_readable_format(data):
    """Flatten the raw extraction into one readable record per drawing item.

    Args:
        data: Mapping with a ``'pdf_visuals'`` list. Each page entry provides
            ``'images'`` (only its length is used) and ``'drawings'``; each
            drawing provides ``'items'`` and, optionally, ``'fill'``,
            ``'fill_opacity'``, ``'color'`` and ``'stroke_opacity'``.

    Returns:
        A list with one dict per page:
        ``{'page_images': <int>, 'drawings': [<record>, ...]}``. Each record
        has ``'type'``, ``'coordinates'`` and ``'properties'``. Rectangle
        coordinates are ``x``/``y`` of the first corner plus true ``width``
        and ``height``. A property missing from the drawing is reported as
        ``None`` (JSON ``null``) rather than the string ``'None'``, so later
        processing can tell "no colour" from a real value.
    """
    readable_data = []

    for page in data['pdf_visuals']:
        page_drawings = []

        for drawing in page['drawings']:
            # Every path item becomes its own record, sharing the parent
            # drawing's fill and stroke properties.
            for item in drawing['items']:
                shape_type, coordinates = _describe_drawing_item(item)
                page_drawings.append({
                    'type': shape_type,
                    'coordinates': coordinates,
                    'properties': {
                        'fill_color': drawing.get('fill'),
                        'fill_opacity': drawing.get('fill_opacity'),
                        'stroke_color': drawing.get('color'),
                        'stroke_opacity': drawing.get('stroke_opacity'),
                    },
                })

        readable_data.append({
            'page_images': len(page['images']),
            'drawings': page_drawings,
        })

    return readable_data



# Read back the raw extraction file
with open("extracted_data_raw.json", 'r') as file:
    actual_data = json.loads(file.read())

# Convert it to the readable per-item structure
readable_actual_data = convert_to_readable_format(actual_data)

# Save the readable structure as indented JSON
with open('extracted_data_readable.json', 'w') as outfile:
    json.dump(readable_actual_data, outfile, indent=4)




In [200]:
import json

def _rgb_to_hex(color):
    """Return ``'#rrggbb'`` for a 3-component colour with channels in [0, 1].

    Anything that is not a 3-item list/tuple of numbers (``None``, a
    placeholder string, an already converted hex string, ...) is returned
    unchanged instead of raising.
    """
    if not isinstance(color, (list, tuple)) or len(color) != 3:
        return color
    if not all(isinstance(channel, (int, float)) for channel in color):
        return color
    # Round rather than truncate: e.g. 0.8549 * 255 = 217.9995 must become
    # 218 (0xda), not 217. Clamp so out-of-range input still formats as 2 digits.
    r, g, b = (min(255, max(0, round(channel * 255))) for channel in color)
    return f'#{r:02x}{g:02x}{b:02x}'


def update_readable_format(data):
    """Produce the final per-page structure with hex colours.

    Args:
        data: List of page dicts, each with ``'page_images'`` and
            ``'drawings'``; every drawing has ``'type'``, ``'properties'``
            and (for shapes with geometry) ``'coordinates'``.

    Returns:
        A new list of ``{'page_images': ..., 'drawings': [...]}`` dicts. In
        each drawing:

        * Rectangle/Line coordinates are copied as they are;
        * Curve coordinates are re-keyed from ``'points'`` to
          ``'control_points'``;
        * other types keep their coordinates when they have any;
        * ``fill_color`` / ``stroke_color`` given as RGB triples are replaced
          by ``'#rrggbb'`` strings; other values are left untouched.

        The input ``properties`` dicts are not modified.
    """
    updated_data = []
    for page in data:
        updated_drawings = []
        for drawing in page['drawings']:
            shape_type = drawing['type']
            updated_drawing = {'type': shape_type}

            if shape_type == 'Curve':
                updated_drawing['coordinates'] = {
                    'control_points': drawing['coordinates']['points']
                }
            elif shape_type in ('Rectangle', 'Line'):
                updated_drawing['coordinates'] = drawing['coordinates']
            elif drawing.get('coordinates'):
                # Keep geometry of other shape types instead of dropping it.
                updated_drawing['coordinates'] = drawing['coordinates']

            # Work on a copy so the caller's data is not modified.
            updated_properties = dict(drawing['properties'])
            for color_key in ('fill_color', 'stroke_color'):
                if updated_properties.get(color_key):
                    updated_properties[color_key] = _rgb_to_hex(updated_properties[color_key])

            updated_drawing['properties'] = updated_properties
            updated_drawings.append(updated_drawing)

        updated_data.append({'page_images': page['page_images'], 'drawings': updated_drawings})

    return updated_data

# Load the readable JSON structure from disk
with open('extracted_data_readable.json', 'r') as f:
    json_data = json.loads(f.read())

updated_json_data = update_readable_format(json_data)

# Write the updated structure to its own file as indented JSON
with open('extracted_data_final.json', 'w') as f:
    json.dump(updated_json_data, f, indent=4)
