In [15]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [16]:
import os
import sys

Append the parent directory (the `grt_reader` repo root) to `sys.path` so we can import modules from sibling directories.


In [17]:
# Make the parent directory importable so project modules (e.g. the `utils`
# package imported in the next cell) can be resolved.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path.
REPO_ROOT = os.path.abspath("..")
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

In [18]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz
from utils.constants import AvailableEmbeddingModels, AvailableModels, question_ids_map
from utils.process_lease import process_lease
from utils.questions import (
    construct_single_question_for_ai,
)

## Threshold for the Levenshtein similarity score (0–100; higher means more similar)


In [19]:
# Minimum similarity score the AI answer must reach; it is compared with `>=`
# against the fuzz.ratio result at the end of the notebook.
threshold: int = 80

In [20]:
# Resolve the bucket and index names from the environment; each one falls
# back to an empty string when its variable is not set.
bucket_name, index_name = (
    os.environ.get(env_var, "") for env_var in ("BUCKET_NAME", "INDEX_NAME")
)

In [21]:
# Echo the resolved configuration, one value per line.
print(
    f"Bucket name: {bucket_name}",
    f"Index name: {index_name}",
    sep="\n",
)

Bucket name: gaaprt-reader-dev
Index name: development-gaaprt-reader-openai


Retrieve questions


In [22]:
# Load the questions workbook, discard its "encoding" column, and replace NaN
# cells with None.
questions_df = (
    pd.read_excel("../data/asc_842/grt_ai_reader_questions_v1.xlsx")
    .drop(columns=["encoding"])
    .replace({np.nan: None})
)

Retrieve answers file


In [23]:
# Load the answers workbook; the resulting frame is handed to process_lease
# below as `answers_df`.
answers_df = pd.read_excel("../data/asc_842/lease_agreements_info/answers.xlsx")

Get the question to send to the AI


In [24]:
# Pick the "lessor_name" question and convert its row into the form that is
# passed to process_lease as `question_for_ai`.
question_id = question_ids_map["lessor_name"]
question_df_filtered = questions_df[questions_df["id"] == question_id]

# Fail with an explicit message rather than an opaque IndexError from `[0]`
# when the questions workbook has no row for this id.
if question_df_filtered.empty:
    raise ValueError(f"No question with id {question_id!r} found in questions_df")

# If several rows share the id, the first one is used (same as before).
question_as_json = question_df_filtered.to_dict(orient="records")[0]
question_for_ai = construct_single_question_for_ai(question_as_json)

In [25]:
# Identifiers for the lease under test, all derived from `file_name`:
# the namespace, the file key, and the local PDF path.
file_name: str = "lease001"
namespace: str = f"eafit_{file_name}"
file_key: str = f"eafit/{file_name}.pdf"
local_file: str = f"../data/asc_842/lease_agreements/{file_name}.pdf"

In [26]:
# Model identifiers forwarded to process_lease as `model` and
# `embedding_model`; the annotations use the project's own type aliases.
model: AvailableModels = "gpt-3.5-turbo"
embedding_model: AvailableEmbeddingModels = "text-embedding-3-large"

In [27]:
# Run the project's process_lease helper for one lease document and one
# question; the returned mapping is read in the following cells via its
# "answer" and "real_answer_unprocessed" keys.
# NOTE(review): the index passed here comes from INDEX_LATEST_NAME, whereas
# the `index_name` variable defined (and printed) earlier reads INDEX_NAME —
# confirm which index this run is meant to target.
result = process_lease(
    local_file=local_file,
    file_name=file_name,
    file_key=file_key,
    namespace=namespace,
    question_for_ai=question_for_ai,
    question_id=question_id,
    answers_df=answers_df,
    model=model,
    embedding_model=embedding_model,
    # NOTE(review): presumably skips re-ingesting the PDF, yet the captured
    # output still reports "Stored ..." — confirm against process_lease.
    already_stored=True,
    force_structured_output=True,
    index_name=os.getenv("INDEX_LATEST_NAME", ""),
)

Stored eafit/lease001.pdf in vector database with namespace eafit_lease001


In [28]:
# Pull the "answer" and "real_answer_unprocessed" entries out of the result.
answer, real_answer_unprocessed = (
    result["answer"],
    result["real_answer_unprocessed"],
)

In [29]:
# Display the answer extracted from the process_lease result.
answer

['ABC Leasing Corporation']

In [30]:
# Compare, ignoring case, the first returned answer with the first entry
# (`.iloc[0]`) of the unprocessed real answer.
predicted_text = answer[0].lower()
expected_text = real_answer_unprocessed.iloc[0].lower()
similarity_score = fuzz.ratio(predicted_text, expected_text)

In [31]:
# True when the similarity score meets the threshold (inclusive).
similarity_score >= threshold

False