In [2]:
import os
from dotenv import load_dotenv, find_dotenv
from pprint import pprint, pformat
import fitz  # Import the PyMuPDF library
import csv
import pandas as pd
from langchain import hub
from langchain_openai  import ChatOpenAI, OpenAIEmbeddings
from langchain.schema import AIMessage, HumanMessage, SystemMessage, Document
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

import tiktoken

from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent, create_pandas_dataframe_agent
from langchain_openai import ChatOpenAI, OpenAI

from nanonets import NANONETSOCR


load_dotenv(find_dotenv())

True

### Global variables

In [3]:
input_file = r"N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub.pdf"
# Extracted tables:
extracted_csvs_path = os.path.join(os.path.dirname(input_file), os.path.splitext(os.path.basename(input_file))[0] + " extracted_tables")
os.makedirs(extracted_csvs_path, exist_ok=True)
nano_extracted_tables_csv_path = os.path.join(extracted_csvs_path, os.path.splitext(os.path.basename(input_file))[0] + "_extracted_tables.csv" )

# Nanonets OCR

In [4]:
model = NANONETSOCR()
model.set_token(os.getenv("NANONESTS_TABLE_MODEL_TOKEN"))

In [5]:
model.convert_to_csv(input_file, output_file_name=nano_extracted_tables_csv_path)

# Parse CSV

In [6]:
def extract_tables_to_csv(csv_file_path, output_path):
    # Extract directory and base name without extension
    csv_directory = os.path.dirname(csv_file_path)
    csv_base_name = os.path.basename(csv_file_path)
    csv_name_without_ext = os.path.splitext(csv_base_name)[0]

    # Create output folder path
    output_folder = output_path#os.path.join(csv_directory, csv_name_without_ext)

    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Initialize variables
    current_table_number = 0
    current_table_lines = []
    table_started = False
    table_paths = []  # Initialize an empty list to store file paths

    # Read the CSV file
    with open(csv_file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            # Convert row list to string to check if it's a table header
            row_str = ','.join(row)
            if row_str.startswith('TABLE '):
                # If we're already in a table, save the current table
                if table_started:
                    path = save_table(current_table_lines, current_table_number, output_folder)
                    table_paths.append(path)  # Add the path to the list
                    current_table_lines = []  # Reset for the next table
                current_table_number += 1
                table_started = True  # Start a new table
            elif table_started:
                # Add row to current table
                current_table_lines.append(row)

    # Save the last table after file ends
    if table_started:
        save_table(current_table_lines, current_table_number, output_folder)
        table_paths.append(path)  # Add the path to the list
        
    return table_paths

def save_table(table_lines, table_number, output_folder):
    output_file_path = os.path.join(output_folder, f"Table_{table_number}.csv")

    # Check if the first row is empty or contains only commas
    if not any(table_lines[0]) or all(cell == '' for cell in table_lines[0]):
        table_lines = table_lines[1:]  # Remove the first row if it's empty

    with open(output_file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(table_lines)
    print(f"Table {table_number} saved to {output_file_path}")
    return output_file_path  # Return the path to the saved file

def extract_tables_to_dfs(csv_file_path):
    # Initialize variables
    current_table_number = 0
    current_table_lines = []
    table_started = False
    dfs = []  # Initialize an empty list to store DataFrames

    # Read the CSV file
    with open(csv_file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            # Check if it's a table header
            row_str = ','.join(row)
            if row_str.startswith('TABLE '):
                # If we're already in a table, convert the current table to a DataFrame
                if table_started:
                    df = pd.DataFrame(current_table_lines[1:], columns=current_table_lines[0])
                    dfs.append(df)
                    current_table_lines = []  # Reset for the next table
                current_table_number += 1
                table_started = True  # Start a new table
            elif table_started:
                # Add row to current table
                current_table_lines.append(row)

    # Convert the last table to a DataFrame after file ends, if any
    if table_started:
        df = pd.DataFrame(current_table_lines[1:], columns=current_table_lines[0])
        dfs.append(df)

    return dfs

def extract_text_from_pdf(pdf_path):
    doc = fitz.open(pdf_path)
    text = ''
    for page in doc:
        text += page.get_text()
    doc.close()

    return text

In [7]:
# Example usage
# csv_file_path = 'data/pdfs/Yash Paystub Nov 17 2023_extracted.csv'
# csv_file_path = r'N:\Dev\AI\Underwriting\output3.csv'
table_paths = extract_tables_to_csv(nano_extracted_tables_csv_path, extracted_csvs_path)
table_dfs = extract_tables_to_dfs(nano_extracted_tables_csv_path)
table_text = extract_text_from_pdf(input_file)

Table 1 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_1.csv
Table 2 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_2.csv
Table 3 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_3.csv
Table 4 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_4.csv
Table 5 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_5.csv
Table 6 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_6.csv


In [8]:
print(table_paths)

['N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_1.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_2.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_3.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_4.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_5.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_5.csv']


In [9]:
table_dfs[0]

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6
0,,STATEMENT,OF\nBULLETIN DE,EARNINGS\nPAIE,,
1,TYPE,HOURS\nHEURES,RATE\nTAUX,AMOUNT\nMONTANT,Y.T.D.\nA JOUR,
2,SALARY,,,"2.591,88","15.551,28",
3,,,,,,
4,,,,,,


In [10]:
model = "gpt-4-turbo-preview"

dfs_chat_agent = create_pandas_dataframe_agent(
    ChatOpenAI(temperature=0, model=model), 
    table_dfs, 
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
)

csv_chat_agent = create_csv_agent(
    ChatOpenAI(temperature=0, model=model),
    table_paths,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
)

text_chat = ChatOpenAI(temperature=0, model=model)
prompt = f'''
The text below is extracted from financial payslips, answer the question based on the text provided.
Text:
f{table_text}

Question:

'''

In [11]:
question = "What is the base salary and what is the YTD earnings?"

In [12]:
csv_chat_agent.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df1.head()'}`


[0m[36;1m[1;3m  Unnamed: 0      STATEMENT OF\nBULLETIN DE   EARNINGS\nPAIE      Unnamed: 4  \
0       TYPE  HOURS\nHEURES      RATE\nTAUX  AMOUNT\nMONTANT  Y.T.D.\nA JOUR   
1     SALARY            NaN             NaN         2.591,88       15.551,28   
2        NaN            NaN             NaN              NaN             NaN   
3        NaN            NaN             NaN              NaN             NaN   

   Unnamed: 5  
0         NaN  
1         NaN  
2         NaN  
3         NaN  [0m[32;1m[1;3mThe base salary is 2,591.88, and the Year-To-Date (YTD) earnings are 15,551.28.[0m

[1m> Finished chain.[0m


'The base salary is 2,591.88, and the Year-To-Date (YTD) earnings are 15,551.28.'

In [13]:
dfs_chat_agent.run(question)



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df1.head()'}`


[0m[36;1m[1;3m                                                                            
0              STATEMENT  OF\nBULLETIN DE   EARNINGS\nPAIE                  
1    TYPE  HOURS\nHEURES       RATE\nTAUX  AMOUNT\nMONTANT  Y.T.D.\nA JOUR  
2  SALARY                                         2.591,88       15.551,28  
3                                                                           
4                                                                           [0m[32;1m[1;3mThe base salary is 2,591.88, and the Year-To-Date (YTD) earnings are 15,551.28.[0m

[1m> Finished chain.[0m


'The base salary is 2,591.88, and the Year-To-Date (YTD) earnings are 15,551.28.'

In [14]:
text_chat.invoke(prompt + question)

AIMessage(content='The base salary for the period is $2,591.88, and the Year-To-Date (YTD) earnings are $15,551.28.')

In [15]:
text_chat.invoke(prompt + "What is the start and end period of this payslip?")

AIMessage(content='The start and end period of this payslip are from 2023/10/01 to 2023/10/15.')

In [16]:
(dfs_chat_agent.agent.dict())

{'runnable': {'name': None,
  'first': {'name': None,
   'mapper': {'name': None,
    'steps': {'agent_scratchpad': RunnableLambda(lambda x: format_to_openai_function_messages(x['intermediate_steps']))}}},
  'middle': [{'name': None,
    'input_variables': ['agent_scratchpad', 'input'],
    'input_types': {'agent_scratchpad': typing.List[typing.Union[langchain_core.messages.ai.AIMessage, langchain_core.messages.human.HumanMessage, langchain_core.messages.chat.ChatMessage, langchain_core.messages.system.SystemMessage, langchain_core.messages.function.FunctionMessage, langchain_core.messages.tool.ToolMessage]]},
    'output_parser': None,
    'partial_variables': {},
    'metadata': None,
    'tags': None,
    'messages': [{'content': "\nYou are working with 6 pandas dataframes in Python named df1, df2, etc.\nThis is the result of `print(df.head())` for each dataframe:\n|    |        |           |             |          |           |    |\n|---:|:-------|:----------|:------------|:------