## Loading CSV files

In [2]:
from langchain.document_loaders.csv_loader import CSVLoader

In [3]:
# loads csv file
def get_data(file_path):
    """Load a CSV file with LangChain's ``CSVLoader``.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to read.

    Returns
    -------
    The result of ``CSVLoader(file_path).load()``. In the sample run shown in
    this notebook that is a ``list`` whose entries carry ``page_content`` and
    ``metadata`` (``source`` and ``row``).
    """
    return CSVLoader(file_path).load()

In [4]:
# Load the sample-count table. `data` is a module-level name that later cells
# read (answer_question passes it to the QA chain).
data = get_data(
    file_path='./Data/Bacti Samples Table from dw_regulations_2021_0701_effective_RTCR.csv'
)
# Show the container type, a blank gap, then the first loaded entry.
print(type(data), "\n", data[0], sep="\n")

# TODOs
# 1. Redo(?) the macros so every CSV is formatted the same way as "Bacti Samples Table from dw_regulations_2021_0701_effective_RTCR.csv"
# 2. TEST combining relevant string data with tabular data using OpenAI querying/completion - use 1.csv and its relevant info as the test case
# 3. Store the results in a dict; figure out how to add the strings to an updated docx file (replace the previous paragraphs, or just append? undecided)
# 4. Reuse this docx file to re-index - previous files can be used once the docx is in the Data folder

<class 'list'>


page_content='\ufeffMonthly Population Served: 25 to 1000\nService Connections: 15 to 400\nMinimum Number of Samples Per Month: 1\n: ' metadata={'source': './Data/Bacti Samples Table from dw_regulations_2021_0701_effective_RTCR.csv', 'row': 0}


## Getting Relevant Text

In [5]:
import docx
from docx import Document
from docx import table
from docx.shared import Inches

In [6]:
# ACTUAL TRAVERSAL

# Load the regulations document whose body is walked below.
doc = docx.Document('./Data/raw_regulations.docx')

# Ordered mix of non-empty Paragraph objects and Table objects.
content = []

# Walk the body's child elements so paragraphs and tables stay in document order.
for block in doc.element.body:
    if isinstance(block, docx.oxml.table.CT_Tbl):
        # Every table is kept.
        content.append(docx.table.Table(block, doc))
    elif isinstance(block, docx.oxml.text.paragraph.CT_P):
        paragraph = docx.text.paragraph.Paragraph(block, doc)
        # Paragraphs are kept only when they contain some text.
        if paragraph.text != "":
            content.append(paragraph)

In [7]:
# RELEVANT CONTENT
# relevant_content[k] is the text gathered from the entries just before the
# k-th table in `content`; tables[k] is that table object.

relevant_content, tables = [], []

def return_relevant_content(i, lookback=3):
    """Concatenate the text of the non-table entries just before ``content[i]``.

    Parameters
    ----------
    i : int
        Index into the module-level ``content`` list (the callers in this
        notebook pass the index of a table).
    lookback : int, optional
        How many entries before ``i`` to inspect. Defaults to 3.

    Returns
    -------
    str
        The ``.text`` of each inspected entry that is not a table, joined in
        document order with no separator. Empty when nothing precedes ``i``.
    """
    # Clamp the window at 0: with a plain ``content[i - 3]`` a table among the
    # first three entries would index negatively and wrap around to text from
    # the END of the document.
    start = max(0, i - lookback)
    return "".join(
        entry.text
        for entry in content[start:i]
        if not isinstance(entry, docx.table.Table)
    )


# Record every table together with the text that precedes it.
for i, item in enumerate(content):
    if not isinstance(item, docx.table.Table):
        continue
    relevant_content.append(return_relevant_content(i))
    tables.append(item)
    # below code is helpful for visualization
    # print("Paragraph:", return_relevant_content(i))
    # print("Table:", item)

In [8]:
# Example: show the text gathered for the first table.
print("This content is relevant to table 1:\n\n", relevant_content[0], sep="")

This content is relevant to table 1:

The type of protection that shall be provided to prevent backflow into the public water supply shall be commensurate with the degree of hazard that exists on the consumer's premises. The type of protective device that may be required (listed in an increasing level of protection) includes: Double check Valve Assembly-(DC), Reduced Pressure Principle Backflow Prevention Device-(RP) and an Air gap Separation-(AG). The water user may choose a higher level of protection than required by the water supplier. The minimum types of backflow protection required to protect the public water supply, at the water user's connection to premises with various degrees of hazard, are given in Table 1. Situations not covered in Table 1 shall be evaluated on a case-by-case basis and the appropriate backflow protection shall be determined by the water supplier or health agency.
TABLE 1TYPE OF BACKFLOW PROTECTION REQUIRED


## Generating Docx Text

In [9]:
# Using default OpenAI Model to index and query
!pip install chromadb
from langchain.llms import OpenAI
from langchain.indexes import VectorstoreIndexCreator
from langchain.callbacks import get_openai_callback
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.text_splitter import CharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.docstore.document import Document
from langchain.prompts import PromptTemplate
from langchain.indexes.vectorstore import VectorstoreIndexCreator

from langchain.chains.question_answering import load_qa_chain
from langchain.llms import OpenAI

import os
# Do not commit a real key here. setdefault keeps a key that is already set in
# the environment instead of overwriting it with this placeholder; the
# placeholder is only used when no key is set at all.
os.environ.setdefault('OPENAI_API_KEY', "your key here")  # your key here


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.1[0m[39;49m -> [0m[32;49m23.1.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [10]:
# Question-answering chain of type "stuff", driven by an OpenAI LLM at temperature 0.
chain = load_qa_chain(
    llm=OpenAI(temperature=0),
    chain_type="stuff",
)

# answers a question using the langchain "stuff" chain (token printing is disabled)
def answer_question(query, documents=None):
    """Run the module-level QA ``chain`` on ``query`` and return its answer.

    Parameters
    ----------
    query : str
        Text passed to the chain as ``question``.
    documents : optional
        Documents passed to the chain as ``input_documents``. When omitted,
        the module-level ``data`` is used, which is what this function always
        did before the parameter existed.

    Returns
    -------
    Whatever ``chain.run`` returns for the given documents and question.
    """
    if documents is None:
        # Backward-compatible default: fall back to the notebook-global documents.
        documents = data
    # The callback context is kept around the call; its handle is not bound
    # because nothing here reads the token counts.
    with get_openai_callback():
        output = chain.run(input_documents=documents, question=query)
    return output

In [11]:
# this generates context
def generate_context(relevant_text):
    """Ask the QA chain for a paragraph explaining a table.

    Parameters
    ----------
    relevant_text : str
        Text interpolated into the prompt as the table's context.

    Returns
    -------
    The answer produced by ``answer_question`` for the built prompt.
    """
    prompt = f"Generate a paragraph describing the meaning of this table, with context being: {relevant_text}"
    return answer_question(prompt)
    
# this generates text version
def generate_text(file_path):
    """Ask the QA chain for a text rendering of the CSV at ``file_path``.

    The CSV is loaded with ``get_data`` and passed to the module-level
    ``chain`` with a fixed instruction. The total token count reported by the
    OpenAI callback is printed before the answer is returned.

    Parameters
    ----------
    file_path : str
        Path of the CSV file to convert.

    Returns
    -------
    The value returned by ``chain.run``.
    """
    question = "generate text version without loss of information."
    with get_openai_callback() as usage:
        table_docs = get_data(file_path)
        answer = chain.run(input_documents=table_docs, question=question)
        print(f"Total Tokens: {usage.total_tokens}" + "\n")
    return answer

In [12]:
# Write the document out as plain text: paragraphs are copied as-is, and each
# table is replaced by generate_context(...) followed by generate_text(...).
curr_table_index = 0

# `with` closes the file even if the loop raises. utf-8 is set explicitly so the
# write does not depend on the platform's default encoding.
with open('./Data/txt_with_table_data.txt', 'w', encoding='utf-8') as file:
    # Iterate through the document's block elements
    for block in doc.element.body:
        if isinstance(block, docx.oxml.text.paragraph.CT_P):
            # If the block is a paragraph and has text, write paragraph to txt
            curr_paragraph = docx.text.paragraph.Paragraph(block, doc).text
            if curr_paragraph != "":
                file.write(curr_paragraph + "\n")
        elif isinstance(block, docx.oxml.table.CT_Tbl):
            # If the block is a table, add context and table text rep to txt.
            # NOTE(review): generate_context() goes through answer_question(),
            # which queries the module-level `data` rather than this table's
            # CSV - confirm that is intended.
            try:
                table_context = generate_context(relevant_content[curr_table_index])
                table_text = generate_text(f"./Data/regulation_csv_comma_sep/{curr_table_index}.csv")
            except Exception as exc:
                # Catch Exception (not a bare except) so Ctrl-C still stops the
                # run, and report the cause instead of hiding it.
                print(f"ERROR AT TABLE {curr_table_index}: {exc!r}")
                file.write(f"loss of information for table {curr_table_index}\n")
            else:
                # Trailing newline keeps the next paragraph from being glued
                # onto the end of the table text.
                file.write(table_context + "\n" + table_text + "\n")
            curr_table_index += 1

print("finished")

Total Tokens: 1244

Total Tokens: 302

Total Tokens: 219

Total Tokens: 228

Total Tokens: 186

Total Tokens: 324

Total Tokens: 382

Total Tokens: 308

Total Tokens: 254

Total Tokens: 167

Total Tokens: 880

Total Tokens: 210

Total Tokens: 188

Total Tokens: 256

Total Tokens: 156

Total Tokens: 170

Total Tokens: 187

Total Tokens: 188

Total Tokens: 170

Total Tokens: 191

Total Tokens: 219

Total Tokens: 1316

Total Tokens: 292

Total Tokens: 504

Total Tokens: 531

Total Tokens: 515

Total Tokens: 527

Total Tokens: 447

ERROR AT TABLE 28
Total Tokens: 306

Total Tokens: 320

Total Tokens: 2141

ERROR AT TABLE 32
ERROR AT TABLE 33
Total Tokens: 571

Total Tokens: 270

Total Tokens: 753

Total Tokens: 642

Total Tokens: 3318

Total Tokens: 449

Total Tokens: 461

Total Tokens: 769

Total Tokens: 202

Total Tokens: 441

Total Tokens: 1422

Total Tokens: 1528

Total Tokens: 1723

Total Tokens: 1019

Total Tokens: 235

Total Tokens: 589

Total Tokens: 411

Total Tokens: 217

Total T