In [1]:
import sys
sys.path.insert(0, '../src/core/')

from utils import (                                                             # type: ignore
    get_ollama_instance,
    pypdf2_get_text_from_pdf,
    pdfplumber_get_text_from_pdf,
    pdf2img_get_image_from_pdf,
    process_image,
    get_imgs_b64,
    show_image,
    extract_text_with_OCR
)

In [2]:
def pipeline(file_path: str):
    
    imgs = pdf2img_get_image_from_pdf(file_path)

    imgs = process_image(imgs)

    data = extract_text_with_OCR(imgs[0])

    prompt = """ 
You are a specialized assistant for extracting structured data from text. Your task is to analyze the provided data and extract specific fields without any additional commentary or explanations.

Data: {}

Please extract the following fields, ensuring accuracy and maintaining the specified formats:

{{
  "passenger_name": "Passenger's name in the format 'LASTNAME/FIRSTNAME', e.g., 'SILVA/RODRIGO'",
  "ticket_number": "Ticket number, typically a 10 to 15 character numeric or alphanumeric code, e.g., '0442155218038'",
  "booking_reference": "Booking reference (PNR), which can be an alphanumeric sequence, e.g., 'OJLPBI'",
  "flight_info": [
    {{
      "flight_number": "Flight number, usually in the format 'AIRLINE_CODE FLIGHT_NUMBER', e.g., 'AR 7709'",
      "departure_city": "City of departure, e.g., 'SAO PAULO GUARULH, BRAZIL'",
      "arrival_city": "City of arrival, e.g., 'AEROPARQUE JORGE NEWBERY (AEP)'",
      "departure_date": "Departure date formatted as 'DD/MMM/YYYY', e.g., '05/sep/2023'",
      "boarding_time": "Boarding time formatted as 'HH:MM', e.g., '09:10'",
      "departure_time": "Departure time formatted as 'HH:MM', e.g., '12:10'",
      "gate": "Gate number or description, e.g., 'TERMINAL 2' or 'Check monitors'",
      "class": "Flight class, can be economy, business, or first class, e.g., 'Cabina/Asiento' or '29E'",
      "seat": "Seat number, which can be a number or a combination of letter and number, e.g., '1' or '28F'",
      "baggage_info": {{
        "allowed": "Allowed baggage quantity, e.g., '1 piece', '2 pieces', 'None'",
        "weight": "Weight limit for baggage, e.g., 'Base fare: KLOWBXEP', '10kg', '20kg'"
      }},
      "status": "Flight status, which can be 'VALIDO PARA VIAJAR', 'ENTITLED TO BOARDING', 'CANCELADO', etc."
    }}
  ]
}}

Respond only with the JSON format, strictly adhering to the structure and ensuring all fields are filled correctly.
Don't respond opions or comments.
""".format(data)

    ollm = get_ollama_instance('llama3')
    response = ollm.invoke(prompt)
    return response


In [3]:
pipeline('./boardingpass_1.pdf')

'{\n  "passenger_name": "SILVA/RODRIGO",\n  "ticket_number": "0442155218038",\n  "booking_reference": "OJLPBI",\n  "flight_info": [\n    {\n      "flight_number": "AR 7709",\n      "departure_city": "SAO PAULO GUARULH, BRAZIL",\n      "arrival_city": "AEROPARQUE JORGE NEWBERY (AEP)",\n      "departure_date": "05/sep/2023",\n      "boarding_time": "09:10",\n      "departure_time": "12:10",\n      "gate": "TERMINAL 2 AEREAS",\n      "class": "Cabina/Asiento",\n      "seat": "",\n      "baggage_info": {\n        "allowed": "1 pieza",\n        "weight": ""\n      },\n      "status": "VALIDO PARA VIAJAR"\n    },\n    {\n      "flight_number": "AR 1258",\n      "departure_city": "SAO PAULO GUARULH, BRAZIL",\n      "arrival_city": "AEROPARQUE JORGE NEWBERY (AEP)",\n      "departure_date": "08/sep/2023",\n      "boarding_time": "16:10",\n      "departure_time": "18:50",\n      "gate": "",\n      "class": "Cabina/Asiento",\n      "seat": "",\n      "baggage_info": {\n        "allowed": "1 pieza

{
  "passenger_name": "SILVA/RODRIGO",
  "ticket_number": "0442155218038",
  "booking_reference": "OJLPBI",
  "flight_info": [
    {
      "flight_number": "AR 7709",
      "departure_city": "SAO PAULO GUARULH, BRAZIL",
      "arrival_city": "AEROPARQUE JORGE NEWBERY (AEP)",
      "departure_date": "05/sep/2023",
      "boarding_time": "09:10",
      "departure_time": "12:10",
      "gate": "TERMINAL 2 AEREAS",
      "class": "Cabina/Asiento",
      "seat": "",
      "baggage_info": { "allowed": "1 pieza", "weight": "" },
      "status": "VALIDO PARA VIAJAR"
    },
    {
      "flight_number": "AR 1258",
      "departure_city": "SAO PAULO GUARULH, BRAZIL",
      "arrival_city": "AEROPARQUE JORGE NEWBERY (AEP)",
      "departure_date": "08/sep/2023",
      "boarding_time": "16:10",
      "departure_time": "18:50",
      "gate": "",
      "class": "Cabina/Asiento",
      "seat": "",
      "baggage_info": { "allowed": "1 pieza", "weight": "" },
      "status": "VALIDO PARA VIAJAR"
    }
  ]
}


In [4]:
pipeline('./boardingpass_2.pdf')

'{\n  "passenger_name": "SILVA/RODRIGO",\n  "ticket_number": "0476623968947",\n  "flight_info": [\n    {\n      "flight_number": "TP675",\n      "departure_city": "AMSTERDAM SCHIPH",\n      "arrival_city": "LISBON",\n      "departure_date": "16 Feb 24",\n      "boarding_time": "07:00",\n      "departure_time": "06:10",\n      "gate": "Check monitors",\n      "class": "29E",\n      "seat": "A",\n      "baggage_info": {\n        "allowed": "",\n        "weight": ""\n      },\n      "status": "VALIDO PARA VIAJAR"\n    },\n    {\n      "flight_number": "TP87",\n      "departure_city": "LIS-LISBON AIRPORT (Terminal 1)",\n      "arrival_city": "SAO PAULO",\n      "departure_date": "16 Feb 24",\n      "boarding_time": "23:30",\n      "departure_time": "22:30",\n      "gate": "",\n      "class": "28F B",\n      "seat": "",\n      "baggage_info": {\n        "allowed": "",\n        "weight": ""\n      },\n      "status": "VALIDO PARA VIAJAR"\n    }\n  ]\n}'

{
  "passenger_name": "SILVA/RODRIGO",
  "ticket_number": "0476623968947",
  "flight_info": [
    {
      "flight_number": "TP675",
      "departure_city": "AMSTERDAM SCHIPH",
      "arrival_city": "LISBON",
      "departure_date": "16 Feb 24",
      "boarding_time": "07:00",
      "departure_time": "06:10",
      "gate": "Check monitors",
      "class": "29E",
      "seat": "A",
      "baggage_info": { "allowed": "", "weight": "" },
      "status": "VALIDO PARA VIAJAR"
    },
    {
      "flight_number": "TP87",
      "departure_city": "LIS-LISBON AIRPORT (Terminal 1)",
      "arrival_city": "SAO PAULO",
      "departure_date": "16 Feb 24",
      "boarding_time": "23:30",
      "departure_time": "22:30",
      "gate": "",
      "class": "28F B",
      "seat": "",
      "baggage_info": { "allowed": "", "weight": "" },
      "status": "VALIDO PARA VIAJAR"
    }
  ]
}


In [5]:
pipeline('./boardingpass_3.pdf')

'{\n  "passenger_name": "SILVA/RODRIGO",\n  "ticket_number": "930909836441203",\n  "booking_reference": "OB607",\n  "flight_info": [\n    {\n      "flight_number": "OB607",\n      "departure_city": "La Paz EI Alto intl Cochabamba J orge Wilsterman",\n      "arrival_city": "",\n      "departure_date": "20 Jul 2024",\n      "boarding_time": "10:35",\n      "departure_time": "20 Jul 2024",\n      "gate": "R18 Grupo4",\n      "class": "Asiento",\n      "seat": "3C",\n      "baggage_info": {\n        "allowed": "1 piece",\n        "weight": "23kg"\n      },\n      "status": ""\n    }\n  ]\n}'

{
  "passenger_name": "SILVA/RODRIGO",
  "ticket_number": "930909836441203",
  "booking_reference": "OB607",
  "flight_info": [
    {
      "flight_number": "OB607",
      "departure_city": "La Paz EI Alto intl Cochabamba J orge Wilsterman",
      "arrival_city": "",
      "departure_date": "20 Jul 2024",
      "boarding_time": "10:35",
      "departure_time": "20 Jul 2024",
      "gate": "R18 Grupo4",
      "class": "Asiento",
      "seat": "3C",
      "baggage_info": { "allowed": "1 piece", "weight": "23kg" },
      "status": ""
    }
  ]
}
