In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os
import sys

Append the `grt_reader` repo root to `sys.path` so we can import modules from sibling directories.


In [3]:
# Make the parent of the current working directory importable.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
repo_root = os.path.abspath("..")
if repo_root not in sys.path:
    sys.path.append(repo_root)

In [4]:
import numpy as np
import pandas as pd
from utils.constants import question_ids_map
from utils.questions import (
    construct_single_question_for_ai,
)

from notebooks.utils.process_lease import process_lease

  from tqdm.autonotebook import tqdm


In [5]:
# Bucket and index names are read from the environment; each falls back to an
# empty string when its variable is not set.
bucket_name, index_name = (
    os.environ.get(variable, "") for variable in ("BUCKET_NAME", "INDEX_NAME")
)

In [6]:
# Echo the resolved configuration so the run records which resources it used.
print(
    f"Bucket name: {bucket_name}",
    f"Index name: {index_name}",
    sep="\n",
)

Bucket name: gaaprt-reader-dev
Index name: development-gaaprt-reader-openai


Retrieve questions


In [7]:
# Load the question catalogue, discard the "encoding" column and turn NaN
# cells into None.
questions_df = (
    pd.read_excel("../data/asc_842/grt_ai_reader_questions_v1.xlsx")
    .drop(columns=["encoding"])
    .replace({np.nan: None})
)

Retrieve answers file


In [8]:
# Ground-truth answers workbook, loaded as a DataFrame.
answers_df = pd.read_excel(
    io="../data/asc_842/lease_agreements_info/answers.xlsx",
)

Get questions to send to the AI


In [9]:
# Select the commencement-date question row and convert it into the payload
# built by `construct_single_question_for_ai`.
question_id = question_ids_map["commencement_date"]
is_selected_question = questions_df["id"] == question_id
question_df_filtered = questions_df.loc[is_selected_question]
question_records = question_df_filtered.to_dict(orient="records")
question_as_json = question_records[0]
question_for_ai = construct_single_question_for_ai(question_as_json)

## Calculating the accuracy for all the leases


In [10]:
# Zero-padded lease numbers "001" .. "100", used to build the lease file names.
numbers_list = [f"{lease_number:03d}" for lease_number in range(1, 101)]

## Experiment 1: gpt-3.5-turbo, text-embedding-ada-002, unstructured outputs


In [None]:
# Experiment 1: run `process_lease` with its default model settings over every
# lease and score exact string matches against the ground-truth date.
# (Per the section heading the defaults are gpt-3.5-turbo and
# text-embedding-ada-002 -- not visible from this cell, confirm in process_lease.)
total_answers = 0
correct_answers = 0
for number in numbers_list:
    file_name = f"lease{number}"
    local_file = f"../data/asc_842/lease_agreements/{file_name}.pdf"
    file_key = f"eafit/{file_name}.pdf"
    namespace = f"eafit_{file_name}"

    # Single .loc lookup instead of chained indexing. A lease that has no row in
    # the answers sheet is skipped rather than aborting the run with IndexError.
    expected_values = answers_df.loc[answers_df["Lease"] == file_name, question_id]
    if expected_values.empty:
        print(f"Skipping {file_name}: no row in the answers file")
        continue

    # Leases whose ground-truth value is missing (NaN/NaT) are not scored.
    if pd.isna(expected_values.iloc[0]):
        continue

    # Counted before the call on purpose: a lease that raises below still
    # counts as evaluated, so failures lower the accuracy.
    total_answers += 1

    try:
        result = process_lease(
            local_file=local_file,
            file_name=file_name,
            file_key=file_key,
            namespace=namespace,
            question_for_ai=question_for_ai,
            question_id=question_id,
            answers_df=answers_df,
        )
        answer = result["answer"]
        real_answer_unprocessed = result["real_answer_unprocessed"]
        formatted_answer = real_answer_unprocessed.iloc[0].strftime("%Y-%m-%d")
        print(f"Answer: {answer}")
        print(f"Real answer: {formatted_answer}")
        if answer == formatted_answer:
            correct_answers += 1
            print("CORRECT")
        else:
            print("INCORRECT")
        print("----------------------------------------------------")
    except Exception as error:
        # Best-effort run: report the failure and move on to the next lease.
        print(f"Error processing {file_name}: {error}")

# Guard against ZeroDivisionError when no lease had a ground-truth answer.
accuracy = correct_answers / total_answers if total_answers else float("nan")
accuracy

## Experiment 2: gpt-4o-mini, text-embedding-3-small, structured outputs


In [23]:
# Experiment 2: gpt-4o-mini with text-embedding-3-small. The answer returned by
# `process_lease` is an object whose `answer_date` attribute is compared, as an
# exact string match, against the ground-truth date.
total_answers = 0
correct_answers = 0
for number in numbers_list:
    file_name = f"lease{number}"
    local_file = f"../data/asc_842/lease_agreements/{file_name}.pdf"
    file_key = f"eafit/{file_name}.pdf"
    namespace = f"eafit_{file_name}"

    # Single .loc lookup instead of chained indexing. A lease that has no row in
    # the answers sheet is skipped rather than aborting the run with IndexError.
    expected_values = answers_df.loc[answers_df["Lease"] == file_name, question_id]
    if expected_values.empty:
        print(f"Skipping {file_name}: no row in the answers file")
        continue

    # Leases whose ground-truth value is missing (NaN/NaT) are not scored.
    if pd.isna(expected_values.iloc[0]):
        continue

    # Counted before the call on purpose: a lease that raises below still
    # counts as evaluated, so failures lower the accuracy.
    total_answers += 1

    try:
        result = process_lease(
            local_file=local_file,
            file_name=file_name,
            file_key=file_key,
            namespace=namespace,
            question_for_ai=question_for_ai,
            question_id=question_id,
            answers_df=answers_df,
            model="gpt-4o-mini",
            embedding_model="text-embedding-3-small",
        )
        answer = result["answer"]
        real_answer_unprocessed = result["real_answer_unprocessed"]
        formatted_answer = real_answer_unprocessed.iloc[0].strftime("%Y-%m-%d")
        print(f"Answer: {answer.answer_date}")
        print(f"Real answer: {formatted_answer}")
        if answer.answer_date == formatted_answer:
            correct_answers += 1
            print("CORRECT")
        else:
            print("INCORRECT")
        print("----------------------------------------------------")
    except Exception as error:
        # Best-effort run: report the failure and move on to the next lease.
        print(f"Error processing {file_name}: {error}")

# Guard against ZeroDivisionError when no lease had a ground-truth answer.
accuracy = correct_answers / total_answers if total_answers else float("nan")
accuracy

Upload Successful: eafit/lease001.pdf
Stored eafit/lease001.pdf in vector database with namespace eafit_lease001
Answer: 
Real answer: 2022-01-25
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease002.pdf
Stored eafit/lease002.pdf in vector database with namespace eafit_lease002
Answer: 2024-03-01
Real answer: 2024-03-01
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease004.pdf
Stored eafit/lease004.pdf in vector database with namespace eafit_lease004
Answer: 
Real answer: 2010-10-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease005.pdf
Stored eafit/lease005.pdf in vector database with namespace eafit_lease005
Answer: 2021-06-01
Real answer: 1995-04-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease006.pdf
Stored eafit/lease006.pdf in vector database with namespace eafit_lease006
Answer: 
Real answer: 2021-08-01


0.5955056179775281

## Experiment 3: gpt-4o-2024-08-06, text-embedding-3-small, structured outputs


In [13]:
# Experiment 3: gpt-4o-2024-08-06 with text-embedding-3-small. The answer
# returned by `process_lease` is an object whose `answer_date` attribute is
# compared, as an exact string match, against the ground-truth date.
total_answers = 0
correct_answers = 0
for number in numbers_list:
    file_name = f"lease{number}"
    local_file = f"../data/asc_842/lease_agreements/{file_name}.pdf"
    file_key = f"eafit/{file_name}.pdf"
    namespace = f"eafit_{file_name}"

    # Single .loc lookup instead of chained indexing. A lease that has no row in
    # the answers sheet is skipped rather than aborting the run with IndexError.
    expected_values = answers_df.loc[answers_df["Lease"] == file_name, question_id]
    if expected_values.empty:
        print(f"Skipping {file_name}: no row in the answers file")
        continue

    # Leases whose ground-truth value is missing (NaN/NaT) are not scored.
    if pd.isna(expected_values.iloc[0]):
        continue

    # Counted before the call on purpose: a lease that raises below still
    # counts as evaluated, so failures lower the accuracy.
    total_answers += 1

    try:
        result = process_lease(
            local_file=local_file,
            file_name=file_name,
            file_key=file_key,
            namespace=namespace,
            question_for_ai=question_for_ai,
            question_id=question_id,
            answers_df=answers_df,
            model="gpt-4o-2024-08-06",
            embedding_model="text-embedding-3-small",
        )
        answer = result["answer"]
        real_answer_unprocessed = result["real_answer_unprocessed"]
        formatted_answer = real_answer_unprocessed.iloc[0].strftime("%Y-%m-%d")
        print(f"Answer: {answer.answer_date}")
        print(f"Real answer: {formatted_answer}")
        if answer.answer_date == formatted_answer:
            correct_answers += 1
            print("CORRECT")
        else:
            print("INCORRECT")
        print("----------------------------------------------------")
    except Exception as error:
        # Best-effort run: report the failure and move on to the next lease.
        print(f"Error processing {file_name}: {error}")

# Guard against ZeroDivisionError when no lease had a ground-truth answer.
accuracy = correct_answers / total_answers if total_answers else float("nan")
accuracy

Upload Successful: eafit/lease001.pdf
Stored eafit/lease001.pdf in vector database with namespace eafit_lease001
Answer: 2022-01-25
Real answer: 2022-01-25
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease002.pdf
Stored eafit/lease002.pdf in vector database with namespace eafit_lease002
Answer: 2024-04-01
Real answer: 2024-03-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease004.pdf
Stored eafit/lease004.pdf in vector database with namespace eafit_lease004
Answer: 2010-10-01
Real answer: 2010-10-01
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease005.pdf
Stored eafit/lease005.pdf in vector database with namespace eafit_lease005
Answer: 2005-04-01
Real answer: 1995-04-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease006.pdf
Stored eafit/lease006.pdf in vector database with namespace eafit_lease006
Answer: None
Re

0.651685393258427

## Experiment 4: gpt-3.5-turbo, text-embedding-3-small, JSON mode


In [12]:
# Experiment 4: gpt-3.5-turbo with text-embedding-3-small, calling
# `process_lease` with force_structured_output=True. The returned answer is
# compared directly, as an exact string match, against the ground-truth date.
total_answers = 0
correct_answers = 0
for number in numbers_list:
    file_name = f"lease{number}"
    local_file = f"../data/asc_842/lease_agreements/{file_name}.pdf"
    file_key = f"eafit/{file_name}.pdf"
    namespace = f"eafit_{file_name}"

    # Single .loc lookup instead of chained indexing. A lease that has no row in
    # the answers sheet is skipped rather than aborting the run with IndexError.
    expected_values = answers_df.loc[answers_df["Lease"] == file_name, question_id]
    if expected_values.empty:
        print(f"Skipping {file_name}: no row in the answers file")
        continue

    # Leases whose ground-truth value is missing (NaN/NaT) are not scored.
    if pd.isna(expected_values.iloc[0]):
        continue

    # Counted before the call on purpose: a lease that raises below still
    # counts as evaluated, so failures lower the accuracy.
    total_answers += 1

    try:
        result = process_lease(
            local_file=local_file,
            file_name=file_name,
            file_key=file_key,
            namespace=namespace,
            question_for_ai=question_for_ai,
            question_id=question_id,
            answers_df=answers_df,
            force_structured_output=True,
            model="gpt-3.5-turbo",
            embedding_model="text-embedding-3-small",
        )
        answer = result["answer"]
        real_answer_unprocessed = result["real_answer_unprocessed"]
        formatted_answer = real_answer_unprocessed.iloc[0].strftime("%Y-%m-%d")
        print(f"Answer: {answer}")
        print(f"Real answer: {formatted_answer}")
        if answer == formatted_answer:
            correct_answers += 1
            print("CORRECT")
        else:
            print("INCORRECT")
        print("----------------------------------------------------")
    except Exception as error:
        # Best-effort run: report the failure and move on to the next lease.
        print(f"Error processing {file_name}: {error}")

# Guard against ZeroDivisionError when no lease had a ground-truth answer.
accuracy = correct_answers / total_answers if total_answers else float("nan")
accuracy

Upload Successful: eafit/lease001.pdf
Stored eafit/lease001.pdf in vector database with namespace eafit_lease001
Answer: 2022-01-25
Real answer: 2022-01-25
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease002.pdf
Stored eafit/lease002.pdf in vector database with namespace eafit_lease002
Answer: 2024-04-01
Real answer: 2024-03-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease004.pdf
Stored eafit/lease004.pdf in vector database with namespace eafit_lease004
Answer: 2010-10-01
Real answer: 2010-10-01
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease005.pdf
Stored eafit/lease005.pdf in vector database with namespace eafit_lease005
Answer: 2007-05-01
Real answer: 1995-04-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease006.pdf
Stored eafit/lease006.pdf in vector database with namespace eafit_lease006
Answer: 2023-01

0.5617977528089888

## Experiment 5: gpt-3.5-turbo, text-embedding-3-large


In [11]:
# Experiment 5: gpt-3.5-turbo with text-embedding-3-large, using the index named
# by the INDEX_LATEST_NAME environment variable. The returned answer is compared
# directly, as an exact string match, against the ground-truth date.
total_answers = 0
correct_answers = 0
for number in numbers_list:
    file_name = f"lease{number}"
    local_file = f"../data/asc_842/lease_agreements/{file_name}.pdf"
    file_key = f"eafit/{file_name}.pdf"
    namespace = f"eafit_{file_name}"

    # Single .loc lookup instead of chained indexing. A lease that has no row in
    # the answers sheet is skipped rather than aborting the run with IndexError.
    expected_values = answers_df.loc[answers_df["Lease"] == file_name, question_id]
    if expected_values.empty:
        print(f"Skipping {file_name}: no row in the answers file")
        continue

    # Leases whose ground-truth value is missing (NaN/NaT) are not scored.
    if pd.isna(expected_values.iloc[0]):
        continue

    # Counted before the call on purpose: a lease that raises below still
    # counts as evaluated, so failures lower the accuracy.
    total_answers += 1

    try:
        result = process_lease(
            local_file=local_file,
            file_name=file_name,
            file_key=file_key,
            namespace=namespace,
            question_for_ai=question_for_ai,
            question_id=question_id,
            answers_df=answers_df,
            embedding_model="text-embedding-3-large",
            model="gpt-3.5-turbo",
            index_name=os.getenv("INDEX_LATEST_NAME", ""),
        )
        answer = result["answer"]
        real_answer_unprocessed = result["real_answer_unprocessed"]
        formatted_answer = real_answer_unprocessed.iloc[0].strftime("%Y-%m-%d")
        print(f"Answer: {answer}")
        print(f"Real answer: {formatted_answer}")
        if answer == formatted_answer:
            correct_answers += 1
            print("CORRECT")
        else:
            print("INCORRECT")
        print("----------------------------------------------------")
    except Exception as error:
        # Best-effort run: report the failure and move on to the next lease.
        print(f"Error processing {file_name}: {error}")

# Guard against ZeroDivisionError when no lease had a ground-truth answer.
accuracy = correct_answers / total_answers if total_answers else float("nan")
accuracy

Upload Successful: eafit/lease001.pdf
Stored eafit/lease001.pdf in vector database with namespace eafit_lease001
Answer: 2023-07-30
Real answer: 2022-01-25
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease002.pdf
Stored eafit/lease002.pdf in vector database with namespace eafit_lease002
Answer: 2024-04-01
Real answer: 2024-03-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease004.pdf
Stored eafit/lease004.pdf in vector database with namespace eafit_lease004
Answer: 2010-10-01
Real answer: 2010-10-01
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease005.pdf
Stored eafit/lease005.pdf in vector database with namespace eafit_lease005
Answer: 2007-05-01
Real answer: 1995-04-01
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease006.pdf
Stored eafit/lease006.pdf in vector database with namespace eafit_lease006
Answer: 2023-

0.5168539325842697

## Experiment 6: gpt-4o-mini, text-embedding-3-large, structured outputs


In [12]:
# Experiment 6: gpt-4o-mini with text-embedding-3-large, using the index named
# by the INDEX_LATEST_NAME environment variable. The answer returned by
# `process_lease` is an object whose `answer_date` attribute is compared, as an
# exact string match, against the ground-truth date.
total_answers = 0
correct_answers = 0
for number in numbers_list:
    file_name = f"lease{number}"
    local_file = f"../data/asc_842/lease_agreements/{file_name}.pdf"
    file_key = f"eafit/{file_name}.pdf"
    namespace = f"eafit_{file_name}"

    # Single .loc lookup instead of chained indexing. A lease that has no row in
    # the answers sheet is skipped rather than aborting the run with IndexError.
    expected_values = answers_df.loc[answers_df["Lease"] == file_name, question_id]
    if expected_values.empty:
        print(f"Skipping {file_name}: no row in the answers file")
        continue

    # Leases whose ground-truth value is missing (NaN/NaT) are not scored.
    if pd.isna(expected_values.iloc[0]):
        continue

    # Counted before the call on purpose: a lease that raises below still
    # counts as evaluated, so failures lower the accuracy.
    total_answers += 1

    try:
        result = process_lease(
            local_file=local_file,
            file_name=file_name,
            file_key=file_key,
            namespace=namespace,
            question_for_ai=question_for_ai,
            question_id=question_id,
            answers_df=answers_df,
            model="gpt-4o-mini",
            embedding_model="text-embedding-3-large",
            index_name=os.getenv("INDEX_LATEST_NAME", ""),
        )
        answer = result["answer"]
        real_answer_unprocessed = result["real_answer_unprocessed"]
        formatted_answer = real_answer_unprocessed.iloc[0].strftime("%Y-%m-%d")
        print(f"Answer: {answer.answer_date}")
        print(f"Real answer: {formatted_answer}")
        if answer.answer_date == formatted_answer:
            correct_answers += 1
            print("CORRECT")
        else:
            print("INCORRECT")
        print("----------------------------------------------------")
    except Exception as error:
        # Best-effort run: report the failure and move on to the next lease.
        print(f"Error processing {file_name}: {error}")

# Guard against ZeroDivisionError when no lease had a ground-truth answer.
accuracy = correct_answers / total_answers if total_answers else float("nan")
accuracy

Upload Successful: eafit/lease001.pdf
Stored eafit/lease001.pdf in vector database with namespace eafit_lease001
Answer: YYYY-MM-DD
Real answer: 2022-01-25
INCORRECT
----------------------------------------------------
Upload Successful: eafit/lease002.pdf
Stored eafit/lease002.pdf in vector database with namespace eafit_lease002
Answer: 2024-03-01
Real answer: 2024-03-01
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease004.pdf
Stored eafit/lease004.pdf in vector database with namespace eafit_lease004
Answer: 2010-10-01
Real answer: 2010-10-01
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease005.pdf
Stored eafit/lease005.pdf in vector database with namespace eafit_lease005
Answer: 1995-04-01
Real answer: 1995-04-01
CORRECT
----------------------------------------------------
Upload Successful: eafit/lease006.pdf
Stored eafit/lease006.pdf in vector database with namespace eafit_lease006
Answer: 2023-10-0

0.6629213483146067