In [129]:
from pdf2image import convert_from_path
from PyPDF2 import PdfReader, PdfWriter
from io import BytesIO
import base64
from io import BytesIO
from PIL import Image # Assuming you are using Pillow for image manipulation
import json
import boto3
import ast
from datetime import datetime
from dateutil.parser import parse

# Pipeline to clean the PDF

In [17]:
def pdf_to_images(path):
    """Render a PDF and return its pages as base64-encoded PNG strings.

    Args:
        path (str): location of the PDF, handed to ``convert_from_path``.
    Returns:
        list[str]: one UTF-8 base64 string per image, in the order
        ``convert_from_path`` returned them.
    """
    def _as_b64_png(page_image):
        # Serialise the image into an in-memory PNG, then base64-encode the bytes.
        buffer = BytesIO()
        page_image.save(buffer, format='PNG')
        return base64.b64encode(buffer.getvalue()).decode('utf-8')

    return [_as_b64_png(page_image) for page_image in convert_from_path(path)]

In [151]:
def body_builder(list_b64, prompt, max_tokens=2048, media_type='image/png'):
    """Assemble the JSON request body for an Anthropic Messages call on Bedrock.

    The single user message holds the text prompt first, followed by one image
    part per entry of ``list_b64``.

    Args:
        list_b64 (iterable of str): base64-encoded images, in sending order.
        prompt (str): instruction text placed before the images.
        max_tokens (int): value sent as ``max_tokens``; defaults to the
            previously hard-coded 2048.
        media_type (str): MIME type declared for every image; defaults to
            'image/png', the previously hard-coded value.
    Returns:
        str: the JSON-serialised request body.
    """
    text_part = {'type': 'text', 'text': prompt}
    image_parts = [
        {
            'type': 'image',
            'source': {'type': 'base64', 'media_type': media_type, 'data': image_b64},
        }
        for image_b64 in list_b64
    ]
    # Key order is kept as before so the serialised body is unchanged for existing callers.
    request = {
        'anthropic_version': 'bedrock-2023-05-31',
        'max_tokens': max_tokens,
        'messages': [{'role': 'user', 'content': [text_part, *image_parts]}],
    }
    return json.dumps(request)

In [152]:
def extract_image(body):
    """
    Send a prepared request body to the Claude model on Bedrock and decode the reply
    Args:
        body (str) : body of the request, passed as-is to invoke_model
    Returns:
        The JSON-decoded response body
    """
    # The client is created in the region configured on the default boto3 session.
    session_region = boto3.Session().region_name
    runtime_client = boto3.client(service_name='bedrock-runtime', region_name=session_region)

    raw_response = runtime_client.invoke_model(
        body=body,
        modelId='anthropic.claude-3-5-sonnet-20240620-v1:0',
        accept='*/*',
        contentType='application/json',
    )

    # The 'body' entry is a stream; read it fully before decoding the JSON.
    payload_stream = raw_response.get('body')
    return json.loads(payload_stream.read())

In [153]:
def clean_pdf(indexes, path):
    """Write a copy of the PDF at ``path`` containing only the pages in ``indexes``.

    Pages are added in the order given by ``indexes``. The copy is written to
    ``path`` with its last four characters replaced by 'eng_clean.pdf'.
    """
    source_pdf = PdfReader(path)
    kept_pages = PdfWriter()
    for page_number in indexes:
        kept_pages.add_page(source_pdf.pages[page_number])

    target_path = path[:-4] + 'eng_clean.pdf'
    with open(target_path, "wb") as target_file:
        kept_pages.write(target_file)

In [158]:
def _parse_page_indexes(text):
    """Turn the model's textual answer into a list of integer page indexes.

    Raises:
        ValueError / SyntaxError: when no list of integers can be read from ``text``.
    """
    try:
        parsed = ast.literal_eval(text.strip())
    except (ValueError, SyntaxError):
        # Fall back to the outermost [...] span in case the array is wrapped
        # in prose or a code fence.
        start, end = text.find('['), text.rfind(']')
        if start == -1 or end < start:
            raise ValueError(f'No list of page indexes found in model response: {text!r}')
        parsed = ast.literal_eval(text[start:end + 1])

    is_index_list = isinstance(parsed, (list, tuple)) and all(
        isinstance(item, int) and not isinstance(item, bool) for item in parsed
    )
    if not is_index_list:
        raise ValueError(f'Expected a list of integer page indexes, got: {parsed!r}')
    return list(parsed)


def remove_disclaimers(prompt, path):
    """Whole pipeline to clean up the pdf putting all together.

    Renders the PDF to images, asks the model which pages to keep, and writes
    the reduced PDF via ``clean_pdf``.

    Args:
        prompt (str): instruction asking the model for an array of page indexes.
        path (str): path of the PDF to clean.
    Returns:
        The decoded model response, as returned by ``extract_image``.
    Raises:
        ValueError / SyntaxError: when the reply holds no list of integer indexes.
    """
    # Get the list of images
    list_b64 = pdf_to_images(path)
    # Build the payload for the LLM
    body = body_builder(list_b64, prompt)
    # Send payload
    r = extract_image(body)
    print(r)
    # Validate the reply before it is used to index PDF pages
    indexes = _parse_page_indexes(r['content'][0]['text'])
    # Rebuild pdf
    clean_pdf(indexes, path)
    return r

In [159]:
# Prompt and path for the testing
path = 'test_files/XS2845792592.pdf'
# The instruction is written as adjacent literals; they join into one single-line string.
prompt = (
    'I am passing you a list of images that come from a financial term sheet, '
    'this financial term sheet contains information relative to the financial asset in some pages, '
    'but some other pages contain only disclaimers statements of risk and other legal stuff, '
    'can you please send me back an array of the index of the images that contain information relative to the asset, '
    'make sure to only pass me back the array of indexes and nothing else'
)
r = remove_disclaimers(prompt=prompt, path=path)

{'id': 'msg_bdrk_01QFekGZvA81ibHcUDuNS7od', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-5-sonnet-20240620', 'content': [{'type': 'text', 'text': '[0, 1, 2, 3, 4]'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 12523, 'output_tokens': 18}}


In [160]:
# Display the model response returned by remove_disclaimers.
r

{'id': 'msg_bdrk_01QFekGZvA81ibHcUDuNS7od',
 'type': 'message',
 'role': 'assistant',
 'model': 'claude-3-5-sonnet-20240620',
 'content': [{'type': 'text', 'text': '[0, 1, 2, 3, 4]'}],
 'stop_reason': 'end_turn',
 'stop_sequence': None,
 'usage': {'input_tokens': 12523, 'output_tokens': 18}}

# Pipeline to extract the JSON from the PDF

In [2]:
# Filled-in sample of the target JSON structure; json.dumps(json_example) is
# embedded in the extraction prompt as the template to follow.
# The repetitive entries are generated from (date, date) / field tuples; key
# order inside every generated dict matches the field order shown here.
json_example = {
    "issuer_lei": "W22LROWP2IHZNBB6K528",
    "isin": "XS2773440412",
    "currency": "EUR",
    "underlyings": [
        {
            "name": name,
            "currency": currency,
            "isin": isin,
            "bloomberg_ticker": ticker,
            "fixing_price_original": fixing_price,
        }
        for name, currency, isin, ticker, fixing_price in (
            ("Mercedes-Benz Group AG", "EUR", "DE0007100000", "MBG GY", "74.0800"),
            ("SAP SE", "EUR", "DE0007164600", "SAP GY", "174.7400"),
            ("NOVO NORDISK A/S", "DKK", "DK0062498333", "NOVOB DC", "888.0000"),
        )
    ],
    "initial_fixing_date": "2024-03-21",
    "issue_date": "2024-04-04",
    "final_fixing_date": "2025-10-06",
    "maturity_date": "2025-10-08",
    "coupon_schedule": [
        {"fixing_date": fixing, "settlement_date": settlement}
        for fixing, settlement in (
            ("2024-05-06", "2024-05-08"),
            ("2024-06-04", "2024-06-06"),
            ("2024-07-04", "2024-07-08"),
            ("2024-08-05", "2024-08-07"),
            ("2024-09-04", "2024-09-06"),
            ("2024-10-04", "2024-10-08"),
            ("2024-11-04", "2024-11-06"),
            ("2024-12-04", "2024-12-06"),
            ("2025-01-06", "2025-01-08"),
            ("2025-02-04", "2025-02-06"),
            ("2025-03-04", "2025-03-06"),
            ("2025-04-04", "2025-04-08"),
            ("2025-05-05", "2025-05-07"),
            ("2025-06-04", "2025-06-06"),
            ("2025-07-04", "2025-07-08"),
            ("2025-08-04", "2025-08-06"),
            ("2025-09-04", "2025-09-08"),
            ("2025-10-06", "2025-10-08"),
        )
    ],
    "autocall_schedule": [
        {"observation_date": observation, "settlement_date": settlement}
        for observation, settlement in (
            ("2024-10-04", "2024-10-08"),
            ("2024-11-04", "2024-11-06"),
            ("2024-12-04", "2024-12-06"),
            ("2025-01-06", "2025-01-08"),
            ("2025-02-04", "2025-02-06"),
            ("2025-03-04", "2025-03-06"),
            ("2025-04-04", "2025-04-08"),
            ("2025-05-05", "2025-05-07"),
            ("2025-06-04", "2025-06-06"),
            ("2025-07-04", "2025-07-08"),
            ("2025-08-04", "2025-08-06"),
            ("2025-09-04", "2025-09-08"),
        )
    ],
    "autocall_percent_value": "100.0000",
    "coupon_percent_value": "9.9600",
    "knock_in_barrier_level": None,
    "strike_level": "79.0000",
    "stepdown_level": None,
    "knock_in_observation": "",
    "first_autocall_observation_date": "2024-10-04",
    "autocall_frequency": "Monthly",
    "autocall_observation_style": "End of Period",
    "autocall_non_call_periods": None,
    "coupon_frequency": "Monthly",
}

In [190]:
def _parse_json_answer(text):
    """Decode the model's reply as JSON, tolerating text around the JSON object.

    The reply is first parsed as-is. Only if that fails is the outermost
    ``{...}`` span extracted and parsed, which covers replies wrapped in a
    code fence or introduced by a sentence.

    Raises:
        json.JSONDecodeError: when neither attempt yields valid JSON.
    """
    try:
        return json.loads(text)
    except json.JSONDecodeError:
        start, end = text.find('{'), text.rfind('}')
        if start == -1 or end < start:
            raise  # nothing to salvage: surface the original decoding error
        return json.loads(text[start:end + 1])


def get_json(path_clean,prompt):
    """Ask the model to extract the structured JSON from a (cleaned) PDF.

    Args:
        path_clean (str): path of the PDF to render and send to the model.
        prompt (str): extraction instruction sent along with the page images.
    Returns:
        tuple: ``(output_json, r)`` - the decoded JSON answer and the full
        model response returned by ``extract_image``.
    Raises:
        json.JSONDecodeError: when the reply contains no decodable JSON.
    """
    # Get the list of images
    list_b64 = pdf_to_images(path_clean)
    # Build the payload for the LLM
    body = body_builder(list_b64, prompt)
    # Send payload
    r = extract_image(body)
    output_json = _parse_json_answer(r['content'][0]['text'])
    return output_json, r

In [191]:
# Cleaned PDF for the first test document; the name follows the
# '<original path minus ".pdf">eng_clean.pdf' pattern that clean_pdf writes.
path_clean = 'test_files/XS2845792592eng_clean.pdf'
# Extraction prompt: embeds json_example (serialised with json.dumps) as the
# template to fill, then lists the expected values for a few fields.
# NOTE(review): the date hint below ('17 January 2025') is not in the
# YYYY-MM-DD form used for first_autocall_observation_date in json_example -
# confirm which format is intended.
prompt = f"""I have this json example {json.dumps(json_example)} and I give you a document containing information about a financial asset, can you please replace the fields and output me back the json with the information of the right document filled, make sure to only give me the json so I can parse it properly, do not put break lines, also take into account that the values of 'first_autocall_observation_date': '17 January 2025',
 'autocall_frequency': 'Monthly' or 'Daily'
 'autocall_observation_style': 'End of Period' or 'Daily Close',
 'coupon_frequency': 'Monthly' or 'Daily'
 """
# json_claude is the parsed JSON answer; r is the full model response.
json_claude, r = get_json(path_clean,prompt)

In [192]:
# Display the JSON extracted by the model.
json_claude

{'issuer_lei': '549300QNMDBVTHX8H127',
 'isin': 'XS2845792592',
 'currency': 'AUD',
 'underlyings': [{'name': 'NVIDIA CORP',
   'currency': 'USD',
   'isin': None,
   'bloomberg_ticker': 'NVDA UW',
   'fixing_price_original': '121.6000'},
  {'name': 'ADVANCED MICRO DEVICES',
   'currency': 'USD',
   'isin': None,
   'bloomberg_ticker': 'AMD UW',
   'fixing_price_original': '164.1500'},
  {'name': 'ALPHABET INC-CL A',
   'currency': 'USD',
   'isin': None,
   'bloomberg_ticker': 'GOOGL UW',
   'fixing_price_original': '184.7700'}],
 'initial_fixing_date': '2024-07-17',
 'issue_date': '2024-07-17',
 'final_fixing_date': '2026-01-20',
 'maturity_date': '2026-01-22',
 'coupon_schedule': [{'fixing_date': '2024-08-19',
   'settlement_date': '2024-08-21'},
  {'fixing_date': '2024-09-17', 'settlement_date': '2024-09-19'},
  {'fixing_date': '2024-10-17', 'settlement_date': '2024-10-21'},
  {'fixing_date': '2024-11-18', 'settlement_date': '2024-11-20'},
  {'fixing_date': '2024-12-17', 'settlemen

# Validation

In [180]:
# Load the reference JSON for the first test document and display it.
with open('test_files/sample_output/XS2845792592.json') as reference_file:
    data = reference_file.read()
json_lgt = json.loads(data)
json_lgt

{'issuer_lei': '549300QNMDBVTHX8H127',
 'isin': 'XS2845792592',
 'currency': 'AUD',
 'underlyings': [{'name': 'Advanced Micro Devices Inc',
   'currency': 'USD',
   'isin': 'US0079031078',
   'bloomberg_ticker': 'AMD UQ',
   'fixing_price_original': '164.1500'},
  {'name': 'Alphabet Inc (Class A)',
   'currency': 'USD',
   'isin': 'US02079K3059',
   'bloomberg_ticker': 'GOOGL UQ',
   'fixing_price_original': '184.7700'},
  {'name': 'NVIDIA Corp',
   'currency': 'USD',
   'isin': 'US67066G1040',
   'bloomberg_ticker': 'NVDA UQ',
   'fixing_price_original': '121.6000'}],
 'initial_fixing_date': '2024-07-03',
 'issue_date': '2024-07-17',
 'final_fixing_date': '2026-01-20',
 'maturity_date': '2026-01-22',
 'coupon_schedule': [{'fixing_date': '2024-08-19',
   'settlement_date': '2024-08-21'},
  {'fixing_date': '2024-09-17', 'settlement_date': '2024-09-19'},
  {'fixing_date': '2024-10-17', 'settlement_date': '2024-10-21'},
  {'fixing_date': '2024-11-18', 'settlement_date': '2024-11-20'},
  {

In [181]:
# Display the model's extraction for comparison with the reference JSON (json_lgt).
json_claude

{'issuer_lei': '549300QNMDBVTHX8H127',
 'isin': 'XS2845792592',
 'currency': 'AUD',
 'underlyings': [{'name': 'NVIDIA CORP',
   'currency': 'USD',
   'isin': '',
   'bloomberg_ticker': 'NVDA UW',
   'fixing_price_original': '121.6000'},
  {'name': 'ADVANCED MICRO DEVICES',
   'currency': 'USD',
   'isin': '',
   'bloomberg_ticker': 'AMD UW',
   'fixing_price_original': '164.1500'},
  {'name': 'ALPHABET INC-CL A',
   'currency': 'USD',
   'isin': '',
   'bloomberg_ticker': 'GOOGL UW',
   'fixing_price_original': '184.7700'}],
 'initial_fixing_date': '2024-07-17',
 'issue_date': '2024-07-17',
 'final_fixing_date': '2026-01-20',
 'maturity_date': '2026-01-22',
 'coupon_schedule': [{'fixing_date': '2024-08-19',
   'settlement_date': '2024-08-21'},
  {'fixing_date': '2024-09-17', 'settlement_date': '2024-09-19'},
  {'fixing_date': '2024-10-17', 'settlement_date': '2024-10-21'},
  {'fixing_date': '2024-11-18', 'settlement_date': '2024-11-20'},
  {'fixing_date': '2024-12-17', 'settlement_date

In [182]:
# Decode the text of the response currently held in r.
response_text = r['content'][0]['text']
output_json = json.loads(response_text)
# Reload the reference JSON from disk.
with open('test_files/sample_output/XS2845792592.json') as reference_file:
    data = reference_file.read()
json_lgt = json.loads(data)

def check_json(json_lgt, json_claude):
    """Compare the model's extraction with the reference JSON, key by key.

    Top-level date fields and the dates inside the schedules are parsed before
    comparison, so two spellings of the same date compare equal. One line is
    printed per key of ``json_lgt``.

    Both inputs are left unmodified (normalisation works on copies), so the
    function can be called repeatedly on the same dictionaries.

    Args:
        json_lgt (dict): reference JSON.
        json_claude (dict): JSON produced by the model.
    Returns:
        None. The verdicts are printed.
    """
    date_keys = (
        'initial_fixing_date', 'issue_date', 'maturity_date',
        'final_fixing_date', 'first_autocall_observation_date',
    )
    # Schedule key -> date fields kept (and parsed) for each entry.
    schedule_fields = {
        'coupon_schedule': ('fixing_date', 'settlement_date'),
        'autocall_schedule': ('observation_date', 'settlement_date'),
    }

    def to_date(value):
        # Only non-empty strings are parsed; None, '' and other values pass
        # through so they are reported as a mismatch instead of crashing.
        if isinstance(value, str) and value.strip():
            try:
                return parse(value)
            except (ValueError, OverflowError):
                return value
        return value

    def normalise(document):
        normalised = dict(document)  # shallow copy: the caller's dict is not touched
        for key in date_keys:
            if key in normalised:
                normalised[key] = to_date(normalised[key])
        # Special treatment for schedules
        for key, fields in schedule_fields.items():
            entries = normalised.get(key)
            if isinstance(entries, list):
                normalised[key] = [
                    {field: to_date(item.get(field)) for field in fields}
                    if isinstance(item, dict) else item
                    for item in entries
                ]
        return normalised

    # TODO: underlyings are still compared as-is (order- and case-sensitive).
    reference = normalise(json_lgt)
    extracted = normalise(json_claude)

    # Verification
    missing = object()
    for key, expected in reference.items():
        actual = extracted.get(key, missing)
        if actual is missing:
            print(f' {key} is not correct lgt {expected} ----- claude <missing>')
        elif expected == actual:
            print(f' {key} is correct')
        else:
            print(f' {key} is not correct lgt {expected} ----- claude {actual}')

# Compare the model's extraction against the reference JSON; one verdict line is printed per key.
check_json(json_lgt, json_claude)  

 issuer_lei is correct
 isin is correct
 currency is correct
 underlyings is not correct lgt [{'name': 'Advanced Micro Devices Inc', 'currency': 'USD', 'isin': 'US0079031078', 'bloomberg_ticker': 'AMD UQ', 'fixing_price_original': '164.1500'}, {'name': 'Alphabet Inc (Class A)', 'currency': 'USD', 'isin': 'US02079K3059', 'bloomberg_ticker': 'GOOGL UQ', 'fixing_price_original': '184.7700'}, {'name': 'NVIDIA Corp', 'currency': 'USD', 'isin': 'US67066G1040', 'bloomberg_ticker': 'NVDA UQ', 'fixing_price_original': '121.6000'}] ----- claude [{'name': 'NVIDIA CORP', 'currency': 'USD', 'isin': '', 'bloomberg_ticker': 'NVDA UW', 'fixing_price_original': '121.6000'}, {'name': 'ADVANCED MICRO DEVICES', 'currency': 'USD', 'isin': '', 'bloomberg_ticker': 'AMD UW', 'fixing_price_original': '164.1500'}, {'name': 'ALPHABET INC-CL A', 'currency': 'USD', 'isin': '', 'bloomberg_ticker': 'GOOGL UW', 'fixing_price_original': '184.7700'}]
 initial_fixing_date is not correct lgt 2024-07-03 00:00:00 ----- cla

In [199]:
# Running json extraction for all

paths = ['test_files/XS2845792592.pdf', 'test_files/XS2915153717.pdf', 'test_files/XS2773440412.pdf', 'test_files/XS2682830307.pdf', 'test_files/CH1350121724.pdf']
# Step 1: drop the disclaimer pages of every term sheet (writes '<name>eng_clean.pdf' next to each source).
prompt = 'I am passing you a list of images that come from a financial term sheet, this financial term sheet contains information relative to the financial asset in some pages, but some other pages contain only disclaimers statements of risk and other legal stuff, can you please send me back an array of the index of the images that contain information relative to the asset, make sure to only pass me back the array of indexes and nothing else'
for path in paths:
    r = remove_disclaimers(prompt, path)

# Step 2: extract the JSON from every cleaned PDF.
# The cleaned paths are derived with the same rule clean_pdf uses for its output
# (path minus its last four characters + 'eng_clean.pdf'), so the list cannot
# drift out of sync with `paths`.
clean_suffix = 'eng_clean.pdf'
paths_clean = [source_path[:-4] + clean_suffix for source_path in paths]
prompt = f"""I have this json example {json.dumps(json_example)} and I give you a document containing information about a financial asset, can you please replace the fields and output me back the json with the information of the right document filled, make sure to only give me the json so I can parse it properly, do not put break lines, also take into account that the values of 'first_autocall_observation_date': '17 January 2025',
 'autocall_frequency': 'Monthly' or 'Daily'
 'autocall_observation_style': 'End of Period' or 'Daily Close',
 'coupon_frequency': 'Monthly' or 'Daily'
 """
for path_clean in paths_clean:
    json_claude, r = get_json(path_clean,prompt)
    # File name without its directory and without the clean suffix; this
    # replaces a fixed [11:-13] slice that only worked for the 'test_files/' prefix.
    document_id = path_clean.rsplit('/', 1)[-1][:-len(clean_suffix)]
    with open(f'test_files/sample_output/{document_id}_parsed.json', 'w', encoding='utf-8') as f:
        json.dump(json_claude, f, ensure_ascii=False, indent=4)

{'id': 'msg_bdrk_018TL7pLrXXzDfPSwzUWjgrU', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-5-sonnet-20240620', 'content': [{'type': 'text', 'text': '[0, 1, 2, 3, 4]'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 12523, 'output_tokens': 18}}
{'id': 'msg_bdrk_015wqtxnq5Xr4xNFBaVUHHao', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-5-sonnet-20240620', 'content': [{'type': 'text', 'text': '[0, 1, 2, 3, 4]'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 15631, 'output_tokens': 18}}
{'id': 'msg_bdrk_011jKnbf1W9rt4z9waVCtgma', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-5-sonnet-20240620', 'content': [{'type': 'text', 'text': '[0, 1, 2, 3]'}], 'stop_reason': 'end_turn', 'stop_sequence': None, 'usage': {'input_tokens': 13582, 'output_tokens': 15}}
{'id': 'msg_bdrk_01E4NAhB1oxB7LsApeDed1p3', 'type': 'message', 'role': 'assistant', 'model': 'claude-3-5-sonnet-20240620', 'content': [{'type': '

In [None]:
# Serving 

# Here I would have liked to expose a FastAPI app with Swagger docs via ngrok