In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [14]:
import os
import sys

In [15]:
sys.path.append(os.path.abspath(os.path.join("../..")))

In [16]:
import numpy as np
import pandas as pd
from fuzzywuzzy import fuzz

from notebooks.utils.constants import (
    AvailableEmbeddingModels,
    AvailableModels,
    question_ids_map,
)
from notebooks.utils.process_lease import process_lease
from notebooks.utils.questions import (
    construct_single_question_for_ai,
)

## Threshold for Levenshtein distance


In [17]:
threshold = 80

Retrieve questions


In [18]:
root_path = sys.path[-1]
questions_file_path = os.path.join(
    root_path, "data/asc_842/grt_ai_reader_questions_v1.xlsx"
)
answers_file_path = os.path.join(
    root_path, "data/asc_842/lease_agreements_info/answers.xlsx"
)


In [19]:
questions_df = pd.read_excel(questions_file_path)
questions_df = questions_df.drop(["encoding"], axis=1)
questions_df = questions_df.replace({np.nan: None})

Retrieve answers file


In [20]:
answers_df = pd.read_excel(answers_file_path)

Get questions to send to the AI


In [21]:
question_id = question_ids_map["lessor_name"]
question_df_filtered = questions_df[questions_df["id"] == question_id]
question_as_json = question_df_filtered.to_dict(orient="records")[0]
question_for_ai = construct_single_question_for_ai(question_as_json)

In [22]:
file_name = "lease001"
local_file = os.path.join(root_path, f"data/asc_842/lease_agreements/{file_name}.pdf")
file_key = f"eafit/{file_name}.pdf"
namespace = f"eafit_{file_name}"

In [23]:
model: AvailableModels = "gpt-3.5-turbo"
embedding_model: AvailableEmbeddingModels = "text-embedding-3-large"

In [25]:
result = process_lease(
    file_name=file_name,
    namespace=namespace,
    question_for_ai=question_for_ai,
    question_id=question_id,
    answers_df=answers_df,
    model=model,
    embedding_model=embedding_model,
    index_name=os.getenv("INDEX_LATEST_NAME", ""),
)

ValueError: Pinecone index name must be provided in either `index_name` or `PINECONE_INDEX_NAME` environment variable

In [28]:
answer = result["answer"]
real_answer_unprocessed = result["real_answer_unprocessed"]

In [29]:
answer

['ABC Leasing Corporation']

In [30]:
similarity_score = fuzz.ratio(
    answer[0].lower(), real_answer_unprocessed.iloc[0].lower()
)

In [31]:
similarity_score >= threshold

False