In [21]:
import os
from dotenv import load_dotenv, find_dotenv
from pprint import pprint, pformat
import fitz  # Import the PyMuPDF library

from langchain import hub
from langchain_openai  import ChatOpenAI, OpenAIEmbeddings
from langchain.schema import AIMessage, HumanMessage, SystemMessage, Document
from langchain.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough

import tiktoken

from langchain.agents.agent_types import AgentType
from langchain_experimental.agents.agent_toolkits import create_csv_agent
from langchain_openai import ChatOpenAI, OpenAI

from nanonets import NANONETSOCR


load_dotenv(find_dotenv())

True

### Global variables

In [25]:
input_file = r"N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub.pdf"
# Extracted tables:
output_csvs_path = os.path.join(os.path.dirname(input_file), os.path.splitext(os.path.basename(input_file))[0] + " extracted_tables")
os.makedirs(output_csvs_path, exist_ok=True)
output_file = os.path.join(output_csvs_path, os.path.splitext(os.path.basename(input_file))[0] + "_extracted_tables.csv" )

# Nanonets OCR

In [26]:
model = NANONETSOCR()
model.set_token(os.getenv("NANONESTS_TABLE_MODEL_TOKEN"))

In [27]:
model.convert_to_csv(input_file, output_file_name=output_file)

# Parse CSV

In [30]:
import csv
import os

def extract_tables(csv_file_path, output_csvs_path):
    # Extract directory and base name without extension
    csv_directory = os.path.dirname(csv_file_path)
    csv_base_name = os.path.basename(csv_file_path)
    csv_name_without_ext = os.path.splitext(csv_base_name)[0]

    # Create output folder path
    output_folder = output_csvs_path#os.path.join(csv_directory, csv_name_without_ext)

    # Ensure the output folder exists
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Initialize variables
    current_table_number = 0
    current_table_lines = []
    table_started = False
    table_paths = []  # Initialize an empty list to store file paths

    # Read the CSV file
    with open(csv_file_path, 'r', newline='') as file:
        reader = csv.reader(file)
        for row in reader:
            # Convert row list to string to check if it's a table header
            row_str = ','.join(row)
            if row_str.startswith('TABLE '):
                # If we're already in a table, save the current table
                if table_started:
                    path = save_table(current_table_lines, current_table_number, output_folder)
                    table_paths.append(path)  # Add the path to the list
                    current_table_lines = []  # Reset for the next table
                current_table_number += 1
                table_started = True  # Start a new table
            elif table_started:
                # Add row to current table
                current_table_lines.append(row)

    # Save the last table after file ends
    if table_started:
        save_table(current_table_lines, current_table_number, output_folder)
        table_paths.append(path)  # Add the path to the list
        
    return table_paths

def save_table(table_lines, table_number, output_folder):
    output_file_path = os.path.join(output_folder, f"Table_{table_number}.csv")

    # Check if the first row is empty or contains only commas
    if not any(table_lines[0]) or all(cell == '' for cell in table_lines[0]):
        table_lines = table_lines[1:]  # Remove the first row if it's empty

    with open(output_file_path, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerows(table_lines)
    print(f"Table {table_number} saved to {output_file_path}")
    return output_file_path  # Return the path to the saved file



In [31]:
# Example usage
# csv_file_path = 'data/pdfs/Yash Paystub Nov 17 2023_extracted.csv'
# csv_file_path = r'N:\Dev\AI\Underwriting\output3.csv'
table_paths = extract_tables(output_file, output_csvs_path)

Table 1 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_1.csv
Table 2 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_2.csv
Table 3 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_3.csv
Table 4 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_4.csv
Table 5 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_5.csv
Table 6 saved to N:\Dev\AI\Underwriting\data\clients\Safia Seyed\Safia Seyed Paystub extracted_tables\Table_6.csv


In [32]:
print(table_paths)

['N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_1.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_2.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_3.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_4.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_5.csv', 'N:\\Dev\\AI\\Underwriting\\data\\clients\\Safia Seyed\\Safia Seyed Paystub extracted_tables\\Table_5.csv']


In [33]:
# agent = create_csv_agent(
#     OpenAI(temperature=0),
#     "data/data.csv",
#     verbose=True,
#     agent_type=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
# )

# chat_agent = create_csv_agent(
#     ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
#     "data/data.csv",
#     verbose=True,
#     agent_type=AgentType.OPENAI_FUNCTIONS,
# )

multi_chat_agent = create_csv_agent(
    ChatOpenAI(temperature=0, model="gpt-3.5-turbo-0613"),
    table_paths,
    verbose=True,
    agent_type=AgentType.OPENAI_FUNCTIONS,
)

In [34]:
multi_chat_agent.run("What is the base salary?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3m
Invoking: `python_repl_ast` with `{'query': 'df1.iloc[1, 3]'}`


[0m[36;1m[1;3m2.591,88[0m[32;1m[1;3mThe base salary is 2,591.88.[0m

[1m> Finished chain.[0m


'The base salary is 2,591.88.'

In [10]:
multi_chat_agent.run("What is the start and end period of this payslip?")



[1m> Entering new AgentExecutor chain...[0m
[32;1m[1;3mTo determine the start and end period of the payslip, we need to look for the relevant information in the dataframes. Specifically, we need to find the dates or time periods mentioned in the data.

Unfortunately, the provided dataframes do not contain any information related to the start and end period of the payslip. The dataframes mainly consist of columns with labels and some numerical values, but there are no dates or time periods mentioned.

Without any specific information about the start and end period in the data, it is not possible to determine the start and end period of the payslip.[0m

[1m> Finished chain.[0m


'To determine the start and end period of the payslip, we need to look for the relevant information in the dataframes. Specifically, we need to find the dates or time periods mentioned in the data.\n\nUnfortunately, the provided dataframes do not contain any information related to the start and end period of the payslip. The dataframes mainly consist of columns with labels and some numerical values, but there are no dates or time periods mentioned.\n\nWithout any specific information about the start and end period in the data, it is not possible to determine the start and end period of the payslip.'