In [None]:
import json
import random
import io
import ast
from PIL import Image
from pdf2image import convert_from_path
from IPython.display import Markdown, display
from openai import OpenAI
import os
import pymupdf
import base64
import shutil
import pandas as pd
import numpy as np
import time

In [None]:
# @title Parsing JSON output
def parse_json(json_output):
    """Return the payload of the first ```json fenced block in a model response.

    The opening fence is recognised even when its line carries surrounding
    whitespace or an upper-case language tag (e.g. "```JSON "), which an exact
    string comparison would miss.

    Args:
        json_output: Raw text, possibly wrapping JSON in a Markdown code fence.

    Returns:
        The text between the opening "```json" line and the next "```".
        When no such opening fence exists, the input is returned unchanged.
    """
    lines = json_output.splitlines()
    for i, line in enumerate(lines):
        # Tolerate indentation, trailing spaces and case differences on the fence line
        if line.strip().lower() == "```json":
            after_fence = "\n".join(lines[i + 1:])  # Drop everything up to and including the fence
            return after_fence.split("```")[0]  # Drop the closing fence and anything after it
    return json_output

In [None]:
# @title inference function
def inference(image_path, prompt, sys_prompt="You are a helpful assistant.", max_new_tokens=4096, return_input=False):
    """Run the vision-language model on one image together with a text prompt.

    Relies on the notebook-level ``model`` and ``processor`` objects.

    Args:
        image_path: Path of the image file on local disk.
        prompt: User text placed before the image in the chat message.
        sys_prompt: Content of the system message.
        max_new_tokens: Generation limit forwarded to ``model.generate``.
        return_input: When true, the processor inputs are returned as well.

    Returns:
        The decoded text generated for the request, or a ``(text, inputs)``
        tuple when ``return_input`` is true.
    """
    image_local_path = "file://" + image_path
    messages = [
        {"role": "system", "content": sys_prompt},
        {"role": "user", "content": [
                {"type": "text", "text": prompt},
                {"image": image_local_path},
            ]
        },
    ]
    text = processor.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    # The image is handed to the processor directly, so the file handle can be
    # released as soon as the processor has built its tensors.
    with Image.open(image_path) as image:
        inputs = processor(text=[text], images=[image], padding=True, return_tensors="pt")

    # Follow whatever device the model was actually placed on instead of
    # assuming Apple 'mps'; the model is loaded with device_map="auto".
    inputs = inputs.to(model.device)

    output_ids = model.generate(**inputs, max_new_tokens=max_new_tokens)
    # generate() returns prompt + continuation; keep only the newly generated part
    generated_ids = [
        sequence[len(prompt_ids):]
        for prompt_ids, sequence in zip(inputs.input_ids, output_ids)
    ]
    output_text = processor.batch_decode(generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=True)
    if return_input:
        return output_text[0], inputs
    return output_text[0]

In [None]:
import torch
from transformers import Qwen2_5_VLForConditionalGeneration, AutoTokenizer, AutoProcessor
from qwen_vl_utils import process_vision_info

# Checkpoint shared by the model weights, the processor and the tokenizer
MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"

# Load the model in float16; device_map="auto" leaves device placement to the library
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float16,
    device_map="auto",
)
# Default processor and tokenizer for the same checkpoint
processor = AutoProcessor.from_pretrained(MODEL_ID)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

In [None]:
# Extraction prompt passed to `inference` for every card image.
# It asks the model to answer in JSON and lists, one per line, the field label
# followed by the instructions for that field in parentheses.
# NOTE(review): the text is part of the program's runtime behaviour, so spelling
# slips ("ususally", "Supplimentary") and the mixed straight/curly quotes are
# deliberately left untouched here; changing them may alter the model output.
prompt = """
You are extracting structured information from a historical Swedish patent card. Please fill in the following fields with high accuracy. Be precise, avoid modernizing historical data, and include only what is requested. All dates must be formatted as yyyy-mm-dd.
OUTPUT IN JSON FORMAT PLEASE

Patent number: (Patent number, output format:a six-digit number)
klass: (Patent klass (original in DPK), marked "klass". Output FORMAT example: "37 b:1/10" or "39 a6:21/00". DO NOT USE EXPONENT. Not available for patent applied before 1968)
IPC: (Patent klass in IPC, marked "Int. Cl.". Output format example: "C 07 d 9/00". Not available for patent applied before 1968)
patenthavare_antal:(The amount of patent holders)
patenthavare1:(First patent holder name. KEEP ONLY THE NAME, NO LOCATION)
patenthavare1_stad:(City of the patent holder, ususally separated by a comma from the name. KEEP ONLY THE CITY NAME)
patenthavare1_land:(Country of the patent holder. KEEP THE HISTORICAL NAME. DO NOT CONVERT INTO MODERN NAMES)
patenthavare_typ:(Type of the patent holder either: 'individual', 'institution', 'company', 'mixed')
patenthavare_typ_av_skrivet:(Type of the record of patent holder. Infer from your vision understanding and analyze carefully, either: 'typed', 'mixed', 'handwritten')
patenthavare2:(If the patentee is more than 1, Name of second patent holder. KEEP ONLY THE NAME, NO LOCATION.)
patenthavare2_stad:(City of the second patent holder, ususally separated by a comma from the name. KEEP ONLY THE CITY NAME)
patenthavare2_land:(Country of the second patent holder. KEEP THE HISTORICAL NAME. DO NOT CONVERT INTO MODERN NAMES)
ombud:(Patent agent. USE ONLY THE NOT CROSSED-OUT ONES. KEEP ONLY THE NAME, including "m fl" if available. NO LOCATION)
ombud_adress:(Patent agent address, if available)
patenttid_fr:(Application date. OUTPUT FORMAT:yyyy-mm-dd)
patenttid_till: (Expected expiration date. OUTPUT FORMAT:yyyy-mm-dd)
beviljandedatum:(Grant date, marked “Patent meddelat”. OUTPUT FORMAT:yyyy-mm-dd)
utgångsdatum: (Actual expiration date, when the patent marked "Kung.förf. or Avförd". OUTPUT FORMAT:yyyy-mm-dd)
utgångsskäl:(Plausible expiration reason, when the patent marked Kung.förf. or Avförd', either: ['Lack of payment of fees', 'Expiration of patent time', 'Compulsory working clause', 'Litigation'])
ansökingsnr:(Application number. DO NOT OMIT HANDWRITTEN PART)
sistaerlagdapatentåravgifter_datum:(Date of last patent fee record before expiration. YEAR MAYBE TYPED IN ANOTHER DIRECTION. OUTPUT FORMAT:yyyy-mm-dd)
sistaerlagdapatentåravgifter_belopp:(Amount of last patent fee record before expiration (krona). OUTPUT FORMAT:number with two decimals)
sista_aviserat_datum:(Date of the last "Aviserat" mark on the patent card, some patents may have no this stamp. OUTPUT FORMAT: yyyy-mm-dd)
uppfinningensbenämning:(Title of the invention)
uppfinningensbenämning_typ:(Type of the invention title. Infer this from your database and visual understanding. Compare very carefully',either ['typed', 'mixed', 'handwritten'])
uppfinnare_antal:(The amount of inventors)
uppfinnare1:(First inventor, fill in the value of patenthavare name if the patent holder is the inventor. NO LOCATION)
uppfinnare2:(Second inventor. NO LOCATION)
uppfinnare3:(Third inventor. NO LOCATION)
uppfinnare4:(Fourth inventor. NO LOCATION)
prioritet:(Priority patent or not, either [‘TRUE’ 'FALSE'])
prioritetsdatum:(Priority patent date. USE THE FIRST ONE, IF THERE ARE MORE THAN ONE. OUTPUT FORMAT: yyyy-mm-dd)
prioritetsland:(Priority patent country. KEEP THE HISTORICAL NAME. DO NOT CONVERT INTO MODERN NAMES)
patentöverföring:(Patent transfer or not,either [‘TRUE’ 'FALSE'])
överföringsdatum:(Transfer date, output format:yyyy-mm-dd)
tidigare_patenthavare:(Previous patent holder, usually crossed out on the card. ONLY THE NAME. NO LOCATION)
tidigare_patenthavare_stad:(City of the previous patent holder, usually crossed out on the card.KEEP ONLY THE CITY NAME)
tidigare_patenthavare_land:(Country of the previous patent holder)
licensupplåtelse:(Patent licensing or not, either [‘TRUE’ 'FALSE'])
tilläggspatent:(Supplimentary patent or not, only true if the tilläggspatentnummer can be found, either [‘TRUE’ 'FALSE'])
tilläggspatentnummer:(Supplimentary patent number)
ombudsbyte:(Patent agent change or not, either [‘TRUE’ 'FALSE'])
tidigare_ombud:(Former patent agent, usually crossed out on the card. ONLY THE NAME. NO LOCATION)
ombudsbytesdatum:(Patent agent change date)
"""


In [None]:
# Preprocess the PDFs:
# merge each two-page card into a single side-by-side page and render it as a JPEG image

def _render_first_page(pdf_path, image_path):
    """Rasterize page 1 of ``pdf_path`` and save it at ``image_path``."""
    # Only the first page is ever used, so do not render the others
    images = convert_from_path(pdf_path, first_page=1, last_page=1)
    images[0].save(image_path)


def combine_two_page(input_path, output_dir=None):
    """Write a one-page PDF and a JPEG rendering of a patent-card PDF.

    A two-page PDF is merged into a single page with page 1 on the left and
    page 2 on the right (both placed in rectangles sized after page 1).
    A PDF with any other page count is saved unchanged. In both cases the
    first page of the saved PDF is rendered to ``<name>.jpg``.

    Args:
        input_path: Path of the source PDF.
        output_dir: Directory receiving ``<name>.pdf`` and ``<name>.jpg``.
            When omitted, the notebook-level ``output_dir`` variable is used,
            which is what earlier versions of this function always did.

    Raises:
        NameError: ``output_dir`` was not passed and no notebook-level
            ``output_dir`` exists.
    """
    if output_dir is None:
        # Backward compatibility with callers that rely on the global variable
        try:
            output_dir = globals()["output_dir"]
        except KeyError:
            raise NameError("name 'output_dir' is not defined") from None

    file_name = os.path.splitext(os.path.basename(input_path))[0]
    output_path = os.path.join(output_dir, f"{file_name}.pdf")
    image_path = os.path.join(output_dir, f"{file_name}.jpg")

    # Context managers make sure the PyMuPDF documents are closed again
    with pymupdf.open(input_path) as input_pdf:
        is_two_page = len(input_pdf) == 2
        if not is_two_page:
            print (f"Transfering {input_path}, not two-page")
            input_pdf.save(output_path)
        else:
            page_rect = input_pdf[0].rect
            width, height = page_rect.width, page_rect.height
            with pymupdf.open() as output_pdf:
                # One page twice as wide as the first source page
                new_page = output_pdf.new_page(width=2 * width, height=height)
                left_rect = pymupdf.Rect(0, 0, width, height)
                right_rect = pymupdf.Rect(width, 0, 2 * width, height)
                new_page.show_pdf_page(left_rect, input_pdf, 0)
                new_page.show_pdf_page(right_rect, input_pdf, 1)
                output_pdf.save(output_path)

    _render_first_page(output_path, image_path)

    if is_two_page:
        print(f"{file_name} combined and saved")


def process_files_in_folder(folder_path, output_dir):
    """Run ``combine_two_page`` on every PDF found directly in ``folder_path``.

    Args:
        folder_path: Directory that is scanned (not recursively) for PDFs.
        output_dir: Directory for the results; created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted for a reproducible processing order; extension match ignores case
    for file in sorted(os.listdir(folder_path)):
        if file.lower().endswith(".pdf"):
            input_path = os.path.join(folder_path, file)
            # Pass the directory explicitly so it cannot drift from a global
            combine_two_page(input_path, output_dir)

In [None]:
# Folder of the patent cards (PDFs) -- EDIT THIS to point at your data.
# The original line had no value after "=", which is a SyntaxError.
folder_path = "patent_cards"
# Temporary directory that receives the merged PDFs and their JPEG renderings
output_dir = 'cards_in_image'

# Fail early with a clear message instead of deep inside the processing loop
if not os.path.isdir(folder_path):
    raise FileNotFoundError(
        f"folder_path {folder_path!r} does not exist; set it to the folder of the patent cards"
    )

process_files_in_folder(folder_path, output_dir)

In [None]:
def resize_image(input_path, max_size=(256, 256)):
    """Shrink the image at ``input_path`` in place.

    The file is converted to RGB, scaled down with ``thumbnail`` so that it
    fits within ``max_size`` (aspect ratio kept), and written back to the
    same path with ``optimize=True`` and ``quality=85``.

    Args:
        input_path: Path of the image file; it is overwritten.
        max_size: ``(width, height)`` bounding box for the result.
    """
    with Image.open(input_path) as source:
        rgb_image = source.convert("RGB")
        rgb_image.thumbnail(max_size)
        # Same path on purpose: the original file is replaced
        rgb_image.save(input_path, optimize=True, quality=85)

In [None]:
# Run the extraction prompt on every card image and collect the parsed JSON records
image_dir = output_dir

results = []

# The text prompt is the same for every card, so tokenize it once.
# This counts the text prompt only, not the system prompt or the image.
input_token_count = len(tokenizer.encode(prompt, add_special_tokens=False))

# Sorted so that the row order of the results is reproducible
for file in sorted(os.listdir(image_dir)):
    if not file.endswith(".jpg"):
        continue

    image_path = os.path.join(image_dir, file)

    # Resize image before inference (overwrites the file in place)
    resize_image(image_path, max_size=(512, 512))

    # Extract patent number from filename
    patentnummer = os.path.splitext(file)[0]
    print(image_path)

    # --- Time the inference call ---
    start_time = time.time()
    response = inference(image_path, prompt)
    elapsed_time = time.time() - start_time

    # --- Clean response if it's wrapped in a Markdown code fence ---
    response = response.strip()
    if response.startswith("```"):
        response = response.strip("`").strip()
        # Remove the language tag as a prefix. str.strip("json") would instead
        # strip any of the characters j/s/o/n from both ends of the text.
        if response[:4].lower() == "json":
            response = response[4:].strip()

    # --- Token counting ---
    output_token_count = len(tokenizer.encode(response, add_special_tokens=False))

    print(f"🕒 Time: {elapsed_time:.2f} sec | 🔡 Input tokens: {input_token_count} | 📝 Output tokens: {output_token_count}")

    try:
        parsed_data = json.loads(response)
    except json.JSONDecodeError as e:
        print(f"Failed to parse JSON for {file}: {e}")
        print("Response was:", repr(response))
        continue

    # Valid JSON that is not an object (e.g. a list) cannot take the extra key
    if not isinstance(parsed_data, dict):
        print(f"Unexpected JSON type for {file}: {type(parsed_data).__name__}")
        print("Response was:", repr(response))
        continue

    parsed_data["patent_nummer"] = patentnummer
    results.append(parsed_data)


In [None]:
# Collect the per-card records into a table and render it in the notebook
df = pd.DataFrame(data=results)
display(df)

In [None]:
# Write the table to disk as UTF-8 CSV, leaving out the row index
csv_file = 'results-with_instructions.csv'
df.to_csv(path_or_buf=csv_file, encoding='utf-8', index=False)
print(f"Results exported to {csv_file}")