In [9]:
import anthropic
import base64
import os
import pandas as pd
import io
import time
import re
import json

import config

## Step one: Initialize Anthropic API client

In [10]:
# Read the Anthropic API key from the local `config` module, so the key itself
# is not written into this notebook.
api_key = config.anthropic_api_key

In [11]:
# Create the Anthropic API client; the API calls further down in this notebook use it.
client = anthropic.Anthropic(api_key = api_key)

## Step two: Compose prompt segments + define prompt function

Below, we define a prompt function that will request a response from the Claude genAI model via the Anthropic API. Keep in mind the following three pieces of the prompt:

* The user message
* The system message
* The assistant prefix

More information via the [Anthropic API technical documentation](https://docs.anthropic.com/en/api/messages)

Note that in order to accommodate different possible `value` columns, we are using a function parameter to define the user message based on the year.

In [12]:
# Prefill for the assistant turn: the opening of the JSON list the model is asked to continue.
# The response text does not repeat this prefix, so `submit_image_prompt` prepends it
# again before the result is parsed as JSON.
assistant_prefix = '''[
    {
        "DESCRIPCION":'''

In [17]:
# take in an image (e.g. from a PDF page) and build the chat messages for the extraction prompt
def compose_image_message(image_path, year, type_of_data = "import", assistant_pre = ""):
    """Build the `messages` list for a table-extraction request on one page image.

    Parameters
    ----------
    image_path : str or path-like
        Path to the page image. The file bytes are base64-encoded and sent
        with the media type `image/jpeg`.
    year : int
        Year of the source table. It selects which value column the prompt
        asks for: before 1964 "DOLARES", exactly 1964 "PESOS CA", and after
        1964 "PESOS CENTROAMERICANOS".
    type_of_data : str, optional
        Not used when building the message; kept so existing calls keep working.
    assistant_pre : str, optional
        Text used to prefill the assistant turn. When empty (the default), no
        assistant turn is added.

    Returns
    -------
    list of dict
        A user message (prompt text followed by the image) and, when
        `assistant_pre` is non-empty, an assistant message holding the prefill.
    """
    # read image; only the read needs the open file handle
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()
    base64_image = base64.b64encode(image_bytes).decode()

    # pick the value column to ask for, based on the year
    if year < 1964:
        value_type = "DOLARES"
    elif year == 1964:
        value_type = "PESOS CA"
    else:
        value_type = "PESOS CENTROAMERICANOS"

    # NOTE(review): the prompt wording (including the "Anazlyze" typo) is left exactly
    # as it was, because changing the prompt can change what the model returns.
    user_message = "For the image included, please read this historical table and return a JSON list that contains one dictionary per row. Anazlyze every single row you see and follow the next instructions to create a JSON list. The output should include the following three keys: DESCRIPCION, PAIS, and VALOR.  Please format each value in the row as follows. For the DESCRIPCION value, include only the string of numbers and spaces that appear in the DESCRIPCION table column - this value looks something like 051 07 02 00. Treat this data like a string and include leading zeros and spaces. Null/blank values are acceptable - return them as empty strings. For the PAIS value, please return exactly the PAIS value (examples: HONDURAS, U A E). Double check that you have the PAIS string exactly right - double check the letters and respect the sequence. Include every single row that contains a PAIS value. For VALOR, please return the value in the " + value_type + " column. Please double-check your accuracy scanning this number."

    # now, compose our message: prompt text first, then the image
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": user_message
                },
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_image
                    }
                }
            ]
        }
    ]

    # Only add the assistant turn when there is something to prefill. An empty
    # assistant message carries no prefill, and the API is expected to reject
    # empty message content (see the Messages API reference).
    if assistant_pre:
        messages.append({
            "role": "assistant",
            "content": assistant_pre
        })

    return messages

In [18]:
# System prompt for the extraction request: sets the model's role and asks for a
# complete, properly closed JSON list and nothing else.
# NOTE(review): the text contains the typo "typerwriter"; it is left unchanged here
# because editing the prompt can change what the model returns.
system_prompt = "You are a historian of economics data, looking to create detailed, error-free JSON list data based on scanned typerwriter tables. Only include the JSON list in your result. Return the complete results, and make sure to properly close the JSON list in your response."

In [19]:
def submit_image_prompt(message, api_client, system, assistant_pre,
                        model = "claude-3-5-sonnet-20241022", max_tokens = 8192):
    """Send a composed message list to the API and return the full response text.

    Parameters
    ----------
    message : list of dict
        The `messages` list to send (for example from `compose_image_message`).
    api_client : object
        Client exposing `messages.create(...)`, e.g. an `anthropic.Anthropic` instance.
    system : str
        System prompt for the request.
    assistant_pre : str
        The assistant prefill that was placed in `message`. It is prepended to
        the returned text so the result starts with the prefill.
    model : str, optional
        Model identifier passed to the API.
    max_tokens : int, optional
        Upper bound on generated tokens passed to the API.

    Returns
    -------
    str
        `assistant_pre` followed by the text of the first content block of the response.
    """
    import warnings

    # send message
    response = api_client.messages.create(
        model=model,
        messages=message,
        system=system,
        max_tokens=max_tokens
    )

    # A response cut off at the token limit cannot end with a closed JSON list,
    # so say so here instead of leaving it to surface later as a parse failure.
    if getattr(response, "stop_reason", None) == "max_tokens":
        warnings.warn(
            "Response stopped at max_tokens; the returned JSON is probably truncated.",
            stacklevel=2,
        )

    # keep just the text of the response
    text_content = response.content[0].text

    # we must manually add the JSON prefix text back
    return assistant_pre + text_content


Note that for this project, we need to conditionally modify the message for a key column (`values`) based on the `year` value in the data.

## Step three: Test the prompt and parse the result as JSON

In [22]:
# Smoke test: run the extraction prompt on one known 1960 import page
# and time the full round trip (message composition + API call).

# `test_message` and `test_output` are read by the cells below.
start_time = time.time()
test_message = compose_image_message("images_import_only/file1960_page40.jpg", year = 1960, type_of_data = "import", assistant_pre = assistant_prefix)
test_output = submit_image_prompt(test_message, client, system_prompt, assistant_prefix)
print(f"Time taken: {time.time() - start_time} seconds")

Time taken: 14.299123287200928 seconds


In [23]:
test_output

'[\n    {\n        "DESCRIPCION":  "048 01 01 03",\n        "PAIS": "E U A",\n        "VALOR": 239824\n    },\n    {\n        "DESCRIPCION": "048 01 01 03",\n        "PAIS": "HOLANDA",\n        "VALOR": 29192\n    },\n    {\n        "DESCRIPCION": "048 01 01 03",\n        "PAIS": "CANADA",\n        "VALOR": 1008\n    },\n    {\n        "DESCRIPCION": "048 01 01 03",\n        "PAIS": "ALEMANOC",\n        "VALOR": 478\n    },\n    {\n        "DESCRIPCION": "048 01 01 03",\n        "PAIS": "MEXICO",\n        "VALOR": 541\n    },\n    {\n        "DESCRIPCION": "048 01 01 03",\n        "PAIS": "REINOUNI",\n        "VALOR": 291\n    },\n    {\n        "DESCRIPCION": "048 01 01 03",\n        "PAIS": "ESPAÑA",\n        "VALOR": 169\n    },\n    {\n        "DESCRIPCION": "048 01 01 04",\n        "PAIS": "ALEMANOC",\n        "VALOR": 3243\n    },\n    {\n        "DESCRIPCION": "048 01 01 04",\n        "PAIS": "HOLANDA",\n        "VALOR": 394\n    },\n    {\n        "DESCRIPCION": "048 01 01 04",

In [24]:
# Check that the raw response text parses as JSON; `json.loads` raises if the
# text is not valid JSON (for example, if the list was never closed).
json.loads(test_output)

[{'DESCRIPCION': '048 01 01 03', 'PAIS': 'E U A', 'VALOR': 239824},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'HOLANDA', 'VALOR': 29192},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'CANADA', 'VALOR': 1008},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'ALEMANOC', 'VALOR': 478},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'MEXICO', 'VALOR': 541},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'REINOUNI', 'VALOR': 291},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'ESPAÑA', 'VALOR': 169},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ALEMANOC', 'VALOR': 3243},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'HOLANDA', 'VALOR': 394},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'E U A', 'VALOR': 338},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ESPAÑA', 'VALOR': 215},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ARGENTIN', 'VALOR': 161},
 {'DESCRIPCION': '048 01 01 05', 'PAIS': 'NICARAGU', 'VALOR': 7},
 {'DESCRIPCION': '048 01 02 00', 'PAIS': 'E U A', 'VALOR': 42086},
 {'DESCRIPCION': '048 01 02 00', 'PAIS': 'DINAMARC', 'VALOR'

In [25]:
# Convert the parsed JSON list (one dict per table row) into a DataFrame,
# which gives one column per key: DESCRIPCION, PAIS, VALOR.
test_output_as_pd = pd.DataFrame(json.loads(test_output))

In [26]:
test_output_as_pd

Unnamed: 0,DESCRIPCION,PAIS,VALOR
0,048 01 01 03,E U A,239824
1,048 01 01 03,HOLANDA,29192
2,048 01 01 03,CANADA,1008
3,048 01 01 03,ALEMANOC,478
4,048 01 01 03,MEXICO,541
5,048 01 01 03,REINOUNI,291
6,048 01 01 03,ESPAÑA,169
7,048 01 01 04,ALEMANOC,3243
8,048 01 01 04,HOLANDA,394
9,048 01 01 04,E U A,338


## Step four (optional): Validate the output with a second prompt

In [47]:
# take in an image plus the first-pass extraction; ask the model to re-check it; return corrected JSON text
def validate_response_second_image(image_path, year = 1960, first_pass = None, api_client = None):
    """Ask the model to compare a first-pass extraction against the page image.

    Parameters
    ----------
    image_path : str or path-like
        Path to the page image. The file bytes are base64-encoded and sent
        with the media type `image/jpeg`.
    year : int, optional
        Year of the source table. It selects which value column the prompt
        names: before 1964 "DOLARES", exactly 1964 "PESOS CA", and after 1964
        "PESOS CENTROAMERICANOS".
    first_pass : str or list, optional
        First-pass result, either as raw JSON text or as an already parsed
        list of dicts (which is serialised with `json.dumps`). `None` is
        treated as an empty list.
    api_client : object, optional
        Client exposing `messages.create(...)`. When omitted, the notebook's
        global `client` is used.

    Returns
    -------
    str
        The JSON prefix used to prefill the assistant turn, followed by the
        text of the first content block of the response.
    """
    # TODO ideas kept from the original notes:
    #   - remove negative language from the prompt
    #   - ask for "ITEM: COUNTRY: VALUE:" per row, then extract values with regex
    #   - parse as JSON as an intermediary step
    #   - batch the requests

    if api_client is None:
        api_client = client

    # read image; only the read needs the open file handle
    with open(image_path, "rb") as image_file:
        image_bytes = image_file.read()
    base64_image = base64.b64encode(image_bytes).decode()

    # The prompt is built by string concatenation, so the first pass must be text.
    if first_pass is None:
        first_pass = []
    if not isinstance(first_pass, str):
        # ensure_ascii=False keeps characters such as "Ñ" readable in the prompt
        first_pass = json.dumps(first_pass, ensure_ascii=False)

    if year < 1964:
        value_type = "DOLARES"
    elif year == 1964:
        value_type = "PESOS CA"
    else:
        value_type = "PESOS CENTROAMERICANOS"

    # Opening of the JSON list used to prefill the assistant turn; it is
    # prepended to the response again below.
    json_prefix = "[\n    {\n        \"DESCRIPCION\":"

    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "I want you to validate the data you previously extracted from a scanned table. To do this, I'm going to send you the first pass data as a JSON list (list of dictionaries). Here is that data: " + first_pass + "   Ok. Done. Now, I'm going to send you the original image of the historical trade data table: "
                },
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_image
                    }
                },
                {
                    "type": "text",
                    "text": "Now I want you to compare the table and the image. What digits and words did you mix up? I want you identify every difference betwen the JSON list and the image. For every record in this JSON list, compare all three values (DESCRIPCION, PAIS, and VALOR) to the image ,where they should appear in the DESCRIPCION, PAIS, and " + value_type + " columns. Wherever you misread a digit, number, or string, fix your mistakes. Now, give me your updated version of this data as a JSON list. No other information is necessary."
                }
            ]
        },
        {
            "role": "assistant",
            "content": json_prefix
        }
    ]

    # send message
    response = api_client.messages.create(
        model="claude-3-5-sonnet-20241022",
        messages=messages,
        system="You are a computational historian who is looking to very very accurately review and update historical data.",
        max_tokens=8192
    )

    text_content = response.content[0].text

    # we must manually add the JSON prefix text back
    return json_prefix + text_content

In [48]:
# Try the validation prompt on the same 1960 import page, passing the raw
# first-pass text (`test_output`) as the data to be checked.

# Time the full round trip so it can be compared with the first pass.
start_time = time.time()
test_second_pass_output = validate_response_second_image("images_import_only/file1960_page40.jpg", year = 1960, first_pass = test_output)
print(f"Time taken: {time.time() - start_time} seconds")

Time taken: 14.428977727890015 seconds


In [49]:
json.loads(test_second_pass_output)

[{'DESCRIPCION': '048 01 01 03', 'PAIS': 'E U A', 'VALOR': 235424},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'HOLANDA', 'VALOR': 22123},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'CANADA', 'VALOR': 1008},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'ALEMANOC', 'VALOR': 478},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'FRANCIA', 'VALOR': 348},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'REINOUNI', 'VALOR': 291},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'ESPAÑA', 'VALOR': 169},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ALEMANOC', 'VALOR': 3242},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'POLONIA', 'VALOR': 394},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'E U A', 'VALOR': 358},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ESPAÑA', 'VALOR': 215},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ARGENTIN', 'VALOR': 161},
 {'DESCRIPCION': '048 01 01 05', 'PAIS': 'NICARAGU', 'VALOR': 7},
 {'DESCRIPCION': '048 01 02 00', 'PAIS': 'E U A', 'VALOR': 42086},
 {'DESCRIPCION': '048 01 02 00', 'PAIS': 'DINAMARC', 'VALOR

In [50]:
# Re-display the first-pass result to compare it with the second pass above.
# NOTE(review): the saved output of this cell matches the second-pass values, not the
# first-pass output displayed earlier in the notebook. `test_output` may have been
# overwritten or the saved output may be stale; re-run from a fresh kernel to confirm.
json.loads(test_output)

[{'DESCRIPCION': '048 01 01 03', 'PAIS': 'E U A', 'VALOR': 235424},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'HOLANDA', 'VALOR': 22123},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'CANADA', 'VALOR': 1008},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'ALEMANOC', 'VALOR': 478},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'FRANCIA', 'VALOR': 348},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'REINOUNI', 'VALOR': 291},
 {'DESCRIPCION': '048 01 01 03', 'PAIS': 'ESPAÑA', 'VALOR': 169},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ALEMANOC', 'VALOR': 3242},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'POLONIA', 'VALOR': 394},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'E U A', 'VALOR': 358},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ESPAÑA', 'VALOR': 215},
 {'DESCRIPCION': '048 01 01 04', 'PAIS': 'ARGENTIN', 'VALOR': 161},
 {'DESCRIPCION': '048 01 01 05', 'PAIS': 'NICARAGU', 'VALOR': 7},
 {'DESCRIPCION': '048 01 02 00', 'PAIS': 'E U A', 'VALOR': 42086},
 {'DESCRIPCION': '048 01 02 00', 'PAIS': 'DINAMARC', 'VALOR

## Step five: Run workflow on complete dataset

In [None]:
# List the page images in the import and export folders.
# The filenames are turned into metadata records in the next cell, and that
# metadata drives the querying process afterwards.

all_import_images = os.listdir('images_import_only')
all_export_images =  os.listdir('images_export_only')

# Accumulator for one metadata dict per image; filled in by the next cell.
all_image_metadata = []

In [None]:
def _image_metadata(image_filename, source_type, folder):
    """Parse a name shaped like `file1960_page40.jpg` into a metadata record.

    Characters 4-7 are read as the year and the characters between position 13
    and the 4-character extension as the page number.

    Raises
    ------
    ValueError
        If either slice is not an integer, i.e. the name does not follow the pattern.
    """
    year = int(image_filename[4:8])
    image_page = int(image_filename[13:-4])
    return {'year': year
            ,'page': image_page
            ,'source_type': source_type
            ,'filename': image_filename
            ,'full_path': folder + '/' + image_filename}


# Rebuild the list from scratch so re-running this cell does not append duplicates.
all_image_metadata = []
skipped_filenames = []

# imports first, then exports
for source_type, folder, filenames in (
    ('import', 'images_import_only', all_import_images),
    ('export', 'images_export_only', all_export_images),
):
    for image_filename in filenames:
        try:
            all_image_metadata.append(_image_metadata(image_filename, source_type, folder))
        except ValueError:
            # Not a page image following the naming pattern: skip it, but keep
            # track of it so the omission is visible.
            skipped_filenames.append(folder + '/' + image_filename)

if skipped_filenames:
    print(f"Skipped {len(skipped_filenames)} file(s) with unexpected names: {skipped_filenames}")

                

In [None]:
# One row per image. `source_type` is sorted descending, so "import" rows come
# before "export" rows; within each type, rows are ordered by year and then page.
all_images_df = pd.DataFrame(all_image_metadata).sort_values(['source_type', 'year', 'page'], ascending=[False, True, True])
all_images_df

In [None]:
# Run the extraction prompt on every export page and save one file per page:
# a CSV when the response parses as JSON, otherwise the raw response text.

# Make sure the output folder exists before the first (paid) API call.
os.makedirs("data_export_only", exist_ok=True)

export_images = all_images_df[all_images_df["source_type"] == "export"].to_dict(orient="records")

for image in export_images:
    image_full_path = image["full_path"]
    image_filename = image["filename"]
    image_year = int(image["year"])
    image_page = image["page"]
    output_stem = image_filename[0:-4]  # filename without its 4-character extension

    start_time = time.time()
    # NOTE(review): this previously called `create_dataframe_from_image`, which is not
    # defined in this notebook. The two-step compose + submit flow from the test
    # cells is used instead; confirm this matches the intended helper.
    export_message = compose_image_message(image_full_path, image_year, type_of_data = "export", assistant_pre = assistant_prefix)
    # Use a loop-local name so the `test_output` from the test cells is not overwritten.
    raw_output = submit_image_prompt(export_message, client, system_prompt, assistant_prefix)
    print(f"Time taken: {time.time() - start_time} seconds")

    # try to parse the JSON and convert to csv. if it works, save as CSV. if not, save as raw TXT
    try:
        parsed_json = json.loads(raw_output)
        final_dataframe = pd.DataFrame(parsed_json)
    except ValueError:
        # json.JSONDecodeError is a ValueError; DataFrame construction from an
        # oddly shaped payload raises ValueError as well.
        print("parsing error! will save as text.")
        output_filename = output_stem + "_export_parsefail.txt"
        # explicit encoding: responses can contain non-ASCII text such as "ESPAÑA"
        with open("data_export_only/" + output_filename, "w", encoding="utf-8") as text_file:
            text_file.write(raw_output)
        print("written as raw text.")
    else:
        output_filename = output_stem + "_export.csv"
        final_dataframe["year"] = image_year
        final_dataframe["pdf_image_page"] = image_page
        final_dataframe.to_csv("data_export_only/" + output_filename, index=False)
        print("success for " + image_filename + " response")

    # pause between requests
    time.sleep(5)


In [None]:
# add slight tweaked version for imports here!!!