In [None]:
# Install the third-party packages this notebook imports.
# PyMuPDF is the distribution that provides the `fitz` module used below.
%pip install PyMuPDF
%pip install anthropic pdf2image
%pip install IPython

In [None]:
import base64
from anthropic import Anthropic
from IPython.display import Image
import pdf2image
import fitz
import io
import requests
from PIL import Image
import base64
from google.colab import userdata
import os
import glob

# Anthropic API client. The key is read from Colab's `userdata` store under the
# name 'ANTHROPIC_API_KEY' rather than being written into the notebook.
client = Anthropic(api_key=userdata.get('ANTHROPIC_API_KEY'))
# Model identifier passed as `model=` on every request made by get_completion().
MODEL_NAME = "claude-3-opus-20240229"

In [None]:
# Helper that sends a list of messages to Claude and returns the text of the reply.
def get_completion(messages):
    """Send ``messages`` to Claude and return the text of the first content block.

    Args:
        messages: Conversation payload, forwarded unchanged as the ``messages``
            argument of ``client.messages.create``.

    Returns:
        The ``text`` attribute of the first item in the response's ``content``.
    """
    # Fixed request settings: temperature 0 and a 2048-token cap on the reply.
    request = {
        "model": MODEL_NAME,
        "max_tokens": 2048,
        "temperature": 0,
        "messages": messages,
    }
    reply = client.messages.create(**request)
    first_block = reply.content[0]
    return first_block.text

In [None]:
# Define the function to convert a pdf slide deck to a list of images.
# Note that we need to ensure we resize images to keep them within Claude's size limits.
# Taken from Anthropic Cookbook: https://github.com/anthropics/anthropic-cookbook/blob/main/multimodal/reading_charts_graphs_powerpoints.ipynb

def pdf_to_base64_pngs(pdf_path,
                       quality=75, max_size=(1024, 1024),
                       output_dir="./sample_output"):
    """Render every page of a PDF to PNG and return the pages base64-encoded.

    Each page is rendered at 300 DPI, written to
    ``<output_dir>/page_<n>.png`` (1-based ``n``), shrunk to fit within
    ``max_size`` if it is larger, re-encoded as PNG and base64-encoded.

    Args:
        pdf_path: Path of the PDF file to open with ``fitz.open``.
        quality: Forwarded to ``Image.save`` as ``quality=``.
            NOTE(review): Pillow's PNG writer does not appear to use this
            option; it is kept so existing callers keep working - confirm.
        max_size: ``(width, height)`` bound in pixels; larger pages are
            downscaled in place with ``Image.thumbnail`` (aspect ratio kept).
        output_dir: Folder that receives the rendered page PNGs. It is created
            if it does not exist.

    Returns:
        A list with one base64 string (UTF-8 decoded) per page, in page order.
    """
    # pix.save() writes to a path inside this folder, so make sure it exists
    # (on a fresh runtime it does not).
    os.makedirs(output_dir, exist_ok=True)

    base64_encoded_pngs = []

    # For a PDF hosted online, fetch it and open it from memory instead:
    #   res = requests.get(URL)
    #   doc = fitz.open(stream=res.content, filetype="pdf")
    doc = fitz.open(pdf_path)
    try:
        # Handle one page at a time so only a single full-resolution render
        # is held in memory, instead of every page of the document at once.
        for page_num in range(doc.page_count):
            page = doc.load_page(page_num)

            # 300/72 scales PDF points (72 per inch) up to 300 DPI.
            pix = page.get_pixmap(matrix=fitz.Matrix(300/72, 300/72))

            output_path = os.path.join(output_dir, f"page_{page_num+1}.png")
            pix.save(output_path)

            # The context manager closes the file handle once the page has
            # been encoded, so the PNG can be deleted safely by the caller.
            with Image.open(output_path) as image:
                # Resize the image only if it exceeds the maximum size
                if image.size[0] > max_size[0] or image.size[1] > max_size[1]:
                    image.thumbnail(max_size, Image.Resampling.LANCZOS)
                image_data = io.BytesIO()
                image.save(image_data, format='PNG', optimize=True, quality=quality)

            base64_encoded = base64.b64encode(image_data.getvalue()).decode('utf-8')
            base64_encoded_pngs.append(base64_encoded)
    finally:
        # Close the PDF document even if rendering a page failed.
        doc.close()

    return base64_encoded_pngs



In [None]:
# This is the prompt given to Claude, along with the slides from the APG update.

# The backslash in `\\n` is escaped on purpose: in this non-raw string a bare
# `\n` would turn into a real line break and the prompt would never show the
# two characters it is describing.
question = """Pull the tables from the pages in the attached documents that say 'Tracking the goal', beginning with the row that says 'We will...'

Keep the content the same. Do not add information. Do not paraphrase. The column headers for the output are 'Achievement statement', 'Name of Measure', 'Start value', 'As of (Date)', 'Target value', 'Current value', 'As of (Date)', 'Update cycle', and 'Footnotes'. Do not include the column headers in the output.

The 'Footnotes' column is the rightmost column of the table. Copy any relevant footnotes that appear on the Goal Target(s) page. A footnote may be relevant to multiple rows, or just one. If there is a superscript number in a cell of the table, referring to a footnote, use the ^ symbol to indicate its location in the original table, instead of the superscript number. If there are no relevant footnotes for that row, leave the cell blank.

The Goal tables may appear on different pages; if they appear on different pages, extract all of them.

If a column is missing in the document, leave it empty in the output table. Each row needs to have the same columns.

If there is a "By" column with a date, you can append that to the Achievement Statement. Otherwise, do not edit information.

After that, add one column to the left which has the goal number. If the original document has a goal number included already, put that in the output. Otherwise, create a consecutive, unique goal number for each row, e.g. 1, 2, 3.

Print the output as a CSV, without any additional text. Ensure there's a new line \\n character after every row, including the final row. The cells should be separated by this symbol: '|'

"""


In [None]:
# Prefix every extracted row with the report's file name before it is appended
# to the file that collects the goals from all reports.

def add_column_to_csv(csv_string, new_column_value):
    """Prepend ``new_column_value`` as a new first column on every data row.

    Rows are separated by newlines and cells by ``|``. Blank lines - including
    the empty piece produced by a trailing newline - are dropped instead of
    being turned into a row that holds only the new value. The result is
    newline-terminated so that successive results can be appended to the same
    file without the last row of one fusing with the first row of the next.

    Args:
        csv_string: Pipe-delimited text, one row per line.
        new_column_value: Value placed in front of each row.

    Returns:
        The rows with the extra leading column, each followed by a newline,
        or an empty string when ``csv_string`` contains no non-blank rows.
    """
    prefixed_rows = []
    for row in csv_string.split('\n'):
        # Tolerate Windows-style line endings.
        row = row.rstrip('\r')

        # Skip blank lines; prefixing them would create bogus "value|" rows.
        if not row.strip():
            continue

        # Same text as joining [new value] + cells with '|'.
        prefixed_rows.append(f"{new_column_value}|{row}")

    if not prefixed_rows:
        return ''

    return '\n'.join(prefixed_rows) + '\n'

In [None]:
# Zero-based slice of the page images sent to Claude for each report
# (i.e. pages 3 through 10).
# NOTE(review): presumably the pages where the 'Tracking the goal' tables
# appear - confirm against the report layout.
GOAL_PAGE_SLICE = slice(2, 10)

# glob returns paths in arbitrary order; sort so the rows appended to
# all_goals.txt come out in the same order on every run.
reports = sorted(glob.glob('./FY2024 Q2 reports/*'))
for report_filepath in reports:
    # Report name = file name without its directory or extension.
    reportname = os.path.splitext(os.path.basename(report_filepath))[0]

    # Clear the page PNGs left over from the previously processed report.
    for leftover_path in glob.glob('./sample_output/*'):
        os.remove(leftover_path)

    print(report_filepath)

    encoded_pngs = pdf_to_base64_pngs(pdf_path=report_filepath)

    # One image block per selected page, followed by the text prompt.
    content = [
        {
            "type": "image",
            "source": {"type": "base64", "media_type": "image/png", "data": encoded_png},
        }
        for encoded_png in encoded_pngs[GOAL_PAGE_SLICE]
    ]
    content.append({"type": "text", "text": question})
    messages = [
        {
            "role": 'user',
            "content": content
        }
    ]

    output = get_completion(messages)
    print(output)

    new_csv_data = add_column_to_csv(output, reportname)

    # Append mode: results from all reports accumulate in one file. The
    # encoding is explicit so non-ASCII characters in the extracted text are
    # written the same way regardless of the platform default.
    with open("./all_goals.txt", "a", encoding="utf-8") as myfile:
        myfile.write(new_csv_data)


./FY2024 Q2 reports/FY2024_Q2_NSF_Progress_Improve_Representation_in_the_Scientific_Enterprise.pdf
Goal Number|Achievement statement|Name of Measure|Start value|As of (Date)|Target value|Current value|As of (Date)|Update cycle|Footnotes
1|Increase the proportion of proposals with principal investigators from groups underrepresented in STEM by 10% over the FY 2022 baseline.|Proportion of investigator proposals (%)|37.4%|9/30/22|41.1%|37.5%|9/30/23|Annually|
2|Increase the proportion of proposals from emerging research institutions by 10% over the FY 2022 baseline.|Proportion of institution proposals (%)|24.2%|9/30/22|26.6%|25.1%|9/30/23|Annually|^Data pulled from draft APG dashboard on 1/17/2024.|^For this goal, women are defined as those who select "female" in demographic data collection. Gender breakdown varies significantly by discipline, but overall, across all of the science and engineering fields that NSF supports, less than 30 percent of proposals come from female investigators, 