In [1]:
import anthropic
import base64
import os
import pandas as pd
import io
import time
import re
import json

import config

In [2]:
# Set the Anthropic API key. It is read from the imported `config` module, so the
# key itself is never written out in this notebook.
api_key = config.anthropic_api_key

## Step one: Initialize Anthropic API client

In [3]:
# Shared API client; the querying function defined in step two sends its requests through it.
client = anthropic.Anthropic(api_key = api_key)

## Step two: Define querying/prompting function

In [None]:
# take in an image (e.g. from a PDF page); query the Claude API; return the reply as JSON text
def create_jsonl_response_from_image(image_path, year = 1960):
    """Ask Claude to transcribe one scanned table image and return the reply as JSON text.

    The image is sent base64-encoded together with a fixed extraction prompt.
    The assistant turn is prefilled with the opening of a JSON list so that the
    reply continues that list; the API returns only the continuation, so the
    prefill is glued back onto the front of the reply before returning.

    Args:
        image_path: Path to the page image. It is always sent with media type
            ``image/jpeg``, regardless of the file extension.
        year: Year of the source table. Years before 1964 use the PESOS column
            for VALORES; 1964 and later use the DOLARES column.

    Returns:
        str: The prefill followed by the model's text. This is unparsed text and
        is not guaranteed to be valid JSON (for example when the reply is cut
        off at ``max_tokens``), so callers should parse it defensively.
    """
    # Prompt-design notes kept from earlier iterations:
    #   - remove negative language
    #   - for each row, extract the following information
    #   - in your response, write ITEM: COUNTRY: VALUE:
    #   - get more responses, THEN use regex to extract values and structure as table
    #   - parse as JSON as an intermediary step
    #   - batch!

    # Single source of truth for the prefill: the exact same text has to be sent
    # as the assistant turn and prepended to the reply afterwards.
    json_prefill = '''[
    {
        "DESCRIPCION":'''

    # read image and encode it for the API
    with open(image_path, "rb") as f:
        base64_image = base64.b64encode(f.read()).decode()

    # the value column is labelled differently before and after 1964
    value_type = "PESOS" if year < 1964 else "DOLARES"

    # construct message for API
    messages = [
        {
            "role": "user",
            "content": [
                {
                    "type": "text",
                    "text": "For the image included, please read this historical table and return a JSON list that contains one dictionary for every row of the table The output should include the following three keys: DESCRIPCION, PAIS, and VALORES and the values should come from every row in the original table. Every row in the original data should appear in the output JSON list. Here are the rows for each key. For the Descripcion value, please only include the string of numbers and spaces that appear in the DESCRIPCION table column - this value looks something like 051 07 02 00. Treat this data like a string and include leading zeros and spaces. Please note that every JSON dictionary object must include a DESCRIPCION column value. When reading the original historical table image, you must fill in the blank DESCRIPCION values for each row that contains only a PAIS value - these PAIS values are all sub-items of a main row and so you can fill in the DESCRIPCION value with the first value you see above those rows with only PAIS values. The PAIS value is either missing (which is ok) or the value in that column, like NICARAGUA, HONDURAS, etc. For VALORES, use the value in the " + value_type + " column."
                },
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/jpeg",
                        "data": base64_image
                    }
                }
            ]
        },
        {
            "role": "assistant",
            "content": json_prefill
        }
    ]

    # send message
    response = client.messages.create(
        model="claude-3-5-sonnet-20241022",
        messages=messages,
        system="You are a historian of economics data, looking to create detailed, error-free JSON list data based on scanned typerwriter tables. Please double-check that all values appear in the correct columns and rows, and that every individual digit has been digitized correctly. Include a complete list of everything you detected in the historical table in your response. Only include the JSON list in your result. Return the complete results, and make sure to properly close the JSON list in your response.",
        max_tokens=8192
    )

    # A reply that stopped because it ran out of tokens cannot have closed the
    # JSON list; say so here so a later parse failure is easy to explain.
    if response.stop_reason == "max_tokens":
        print(f"warning: reply for {image_path} hit max_tokens; the JSON is probably truncated.")

    text_content = response.content[0].text

    # we must manually add the JSON prefix text back
    return json_prefill + text_content

In [None]:
# test

# Smoke test with an existing image: run one query and time the round trip.
# Calls the function defined in step two; the previous name
# (create_dataframe_from_image) is not defined in this notebook.
start_time = time.time()
test_output = create_jsonl_response_from_image("images_import_only/file1960_page40.jpg")
print(f"Time taken: {time.time() - start_time} seconds")

## Step three: Run workflow

In [None]:
# create metadata based on import/export images in folders
# this metadata will drive the querying process next

# Raw directory listings. os.listdir returns names in arbitrary order, and the
# folders may hold entries that are not page images.
all_import_images = os.listdir('images_import_only')
all_export_images =  os.listdir('images_export_only')

# One dict per image is collected into this list by the next cell.
all_image_metadata = []

In [None]:
def _parse_image_metadata(image_filename, source_type, folder):
    """Build one metadata record from a filename shaped like ``file1960_page40.jpg``.

    Args:
        image_filename: Bare filename inside ``folder``.
        source_type: Label stored in the record ('import' or 'export').
        folder: Directory the file lives in; used to build ``full_path``.

    Returns:
        dict with keys ``year``, ``page``, ``source_type``, ``filename`` and
        ``full_path``.

    Raises:
        ValueError: if the year or page slice of the filename is not an integer.
    """
    return {
        'year': int(image_filename[4:8]),     # characters 4-7 hold the year
        'page': int(image_filename[13:-4]),   # between "_page" and the 4-character extension
        'source_type': source_type,
        'filename': image_filename,
        'full_path': folder + '/' + image_filename,
    }


# Rebuild the list from scratch so re-running this cell never duplicates records.
all_image_metadata = []
skipped_filenames = []

for source_type, folder, filenames in (
    ('import', 'images_import_only', all_import_images),
    ('export', 'images_export_only', all_export_images),
):
    for image_filename in filenames:
        try:
            all_image_metadata.append(_parse_image_metadata(image_filename, source_type, folder))
        except ValueError:
            # Best effort: files that do not follow the naming pattern are
            # skipped, but reported below rather than silently dropped.
            skipped_filenames.append(folder + '/' + image_filename)

if skipped_filenames:
    print(f"Skipped {len(skipped_filenames)} file(s) with unexpected names: {skipped_filenames}")

                

In [None]:
# Turn the metadata records into the work list. Rows are ordered by source_type
# descending (so 'import' rows come before 'export' rows), then by year and
# page ascending.
all_images_df = (
    pd.DataFrame(all_image_metadata)
    .sort_values(
        by=['source_type', 'year', 'page'],
        ascending=[False, True, True],
    )
)
all_images_df

In [None]:
# ALL EXPORTS LET US TRY

# Query every export page image and save the result next to the others:
# a CSV when the reply parses as JSON, otherwise the raw reply as a text file.

# Make sure the output folder exists so neither the CSV nor the fallback text write fails.
os.makedirs("data_export_only", exist_ok=True)

for image in all_images_df[all_images_df["source_type"] == "export"].to_dict(orient="records"):
    image_full_path = image["full_path"]
    image_filename = image["filename"]
    image_year = int(image["year"])
    image_page = image["page"]

    start_time = time.time()
    # Calls the function defined in step two; the previous name
    # (create_dataframe_from_image) is not defined in this notebook.
    test_output = create_jsonl_response_from_image(image_full_path, image_year)
    print(f"Time taken: {time.time() - start_time} seconds")

    # try to parse the JSON and convert to csv. if it works, save as CSV. if not, save as raw TXT
    try:
        parsed_json = json.loads(test_output)
        final_dataframe = pd.DataFrame(parsed_json)
        output_filename = image_filename[0:-4]  # drop the 4-character extension
        output_filename = output_filename + "_export.csv"
        final_dataframe["year"] = image_year
        final_dataframe["pdf_image_page"] = image_page
        final_dataframe.to_csv("data_export_only/" + output_filename, index=False)
        print("success for " + image_filename + " response")
    except Exception as parse_error:
        # `Exception` rather than a bare except, so Ctrl-C / kernel interrupt
        # still stops this long-running loop. Any failure while building or
        # saving the table falls back to keeping the raw reply.
        print("parsing error! will save as text.")
        print(f"reason: {parse_error!r}")
        output_filename = image_filename[0:-4]
        output_filename = output_filename + "_export_parsefail.txt"
        with open("data_export_only/" + output_filename, "w") as text_file:
            text_file.write(test_output)
        print("written as raw text.")

    # pause between requests (presumably to stay under API rate limits)
    time.sleep(5)


In [None]:
# TODO: add a slightly tweaked version of the export loop above for imports here