In [None]:
####################################################################################################
## CORE IMPORTS

import datetime as dt
import json
import os
import sys
from typing import Annotated, List, TypedDict
from uuid import uuid4
import operator

import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import pyspark
from dotenv import find_dotenv, load_dotenv

# EH Custom
# import libs.ai as ail
from configs.agents import AGENTS
from configs.models import MODELS
from configs.prompts import PROMPTS
import libs.agent_tools as atools

# Stop Python from writing .pyc files for modules imported after this point.
sys.dont_write_bytecode = True
# Enable pandas copy-on-write mode for the whole session.
pd.set_option("mode.copy_on_write", True)
# NOTE(review): this chdir is relative, so the cell is not idempotent -- running it a
# second time from the new working directory fails unless "dawgpyl/code" exists there too.
os.chdir("dawgpyl/code")
print(os.getcwd())
# Load variables from the .env file located by find_dotenv(); the return value is discarded.
_ = load_dotenv(find_dotenv())

In [None]:
####################################################################################################
## Imports for Development

### Web Retrieval
import requests
from bs4 import BeautifulSoup

### Data Storage
import chromadb
# from chromadb import Client as VectorDBClient


## LLML
### LangChain I/O
from langgraph.checkpoint.sqlite import SqliteSaver
from langchain_community.vectorstores import Chroma
from langchain_community.document_loaders import (
    PDFMinerLoader,
    PDFPlumberLoader,
    PyMuPDFLoader,
    TextLoader,
)
from langchain_text_splitters import CharacterTextSplitter
from langchain_community.embeddings.sentence_transformer import SentenceTransformerEmbeddings


### LangChain Inference
from langchain_core.messages import AIMessage, AnyMessage, ChatMessage, HumanMessage, SystemMessage
# from langchain_core.prompts import PromptTemplate

### Agent Orchestration
from langgraph.graph import END, MessageGraph, StateGraph

### Model APIs
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from openai import OpenAI
# from langchain_openai import ChatOpenAI
# from langchain_openai import OpenAI


Testing PDF parsing

In [None]:
####################################################################################################
### PDF Parsing
import PyPDF2
from tabula.io import convert_into, read_pdf


# Choose pdf file
pdf_dir = "data/pdfs"
# os.listdir returns entries in arbitrary order; sort so the hard-coded index below
# selects the same file on every run.
pdf_files = sorted(os.listdir(pdf_dir))
# Join with a real path separator (plain concatenation produced "data/pdfs<name>").
pdf_file = os.path.join(pdf_dir, pdf_files[3])
pdf_reader = PyPDF2.PdfReader(pdf_file)
pdf_len = len(pdf_reader.pages)
print(f"{pdf_file = }")


# Parse pdf file: ask tabula for the tables of each page (page numbers are 1-based)
# and record one dictionary per table.
pdf_table_index = []
for page_num in range(1, pdf_len + 1):
    tables = read_pdf(pdf_file, pages=str(page_num))
    for idx, table in enumerate(tables):
        pdf_table_index.append(
            {
                "page_num": int(page_num),
                "table_num": int(idx),
                # "table_dataframe":table,
                "table_dict": table.to_dict(),
            }
        )
    del tables

for table_entry in pdf_table_index:
    print(table_entry)

# print(pdf_table_index[0])
# output = convert_into(pdf_file, "output.csv", output_format="csv", pages='all')

# loader = PDFMinerLoader(pdf_file)
# # loader = PyMuPDFLoader(pdf_file)
# # loader = PDFPlumberLoader(pdf_file)

# data = loader.load()
# print(data)

In [None]:

import os

from libs.env import *

from libs.agents import Agent, AgentConfig, create_agent, invoke_agent
from libs.base import Directories
from libs.graphs import run_team_workflow
from libs.io import read_text, write_text,read_file,write_file
# from libs.apis import APIS

import chromadb
# from chromadb import Client as VectorDBClient
from langchain_community.vectorstores import Chroma

from langchain_community.document_loaders import TextLoader,PyMuPDFLoader
from langchain_community.embeddings.sentence_transformer import (
    SentenceTransformerEmbeddings,
)
from langchain_text_splitters import CharacterTextSplitter

import pymupdf
import fitz 
from time import sleep


# Project directory registry; its attributes are printed at the end of this cell.
dirs = Directories()

# NOTE(review): the log file is given a ".py" extension -- confirm this is intended.
log_file_path = os.path.join(dirs.logs,'log.py')

# Need a BIG data repository...
# Relative locations of the PDF store and the database directory.
dir_filestore = "data/pdfs"
dir_vector_db = "data/databases"



# print_heading("Available APIs and Models",'green')
# eprint(MODELS)
# print("\n")

# NOTE(review): print_heading, eprint and print_dict are not defined in this notebook;
# presumably they come from the `from libs.env import *` star import -- confirm.
print_heading("Available Agent Personas",'green')
eprint([x for x in list(AGENTS.keys())])
print("\n")

print_heading("Project Directories",'green')
print_dict(dirs.__dict__,'green')

# forked from urobot
import re

import camelot
import fitz
import pandas as pd
from pdfminer.converter import PDFPageAggregator
from pdfminer.high_level import extract_pages, extract_text
from pdfminer.layout import LAParams, LTTextBox, LTTextBoxHorizontal, LTTextContainer
from pdfminer.pdfinterp import PDFPageInterpreter, PDFResourceManager
from pdfminer.pdfpage import PDFPage


def aggregate_table_to_text(series, separator="; "):
    """Collapse a pandas Series into a single string.

    Args:
        series: Series whose elements are cast to ``str`` before joining.
        separator: Text placed between consecutive elements.

    Returns:
        One string containing every element of ``series`` in order.
    """
    return separator.join(series.astype(str))


def prepare_all_text_data(df, separator=" | ", clean=True):
    """Flatten every row of ``df`` into one text string.

    The cells of each row are cast to ``str`` and joined with ``separator``.
    When ``clean`` is true the joined text is lower-cased, HTML-like tags are
    removed, and line breaks, punctuation/underscores and repeated whitespace
    are collapsed to single spaces.

    The input DataFrame is left untouched (the previous implementation added a
    ``prepared_text`` column to the caller's frame as a side effect).

    Args:
        df: DataFrame to flatten.
        separator: Text placed between the cells of a row.
        clean: Whether to normalise the joined text.

    Returns:
        A Series named ``"prepared_text"`` indexed like ``df``, one string per row.
    """
    # apply(axis=1) does not reliably yield a Series for an empty frame, so
    # answer directly: one empty string per (possibly zero) rows.
    if df.empty:
        return pd.Series("", index=df.index, name="prepared_text", dtype=object)

    combined_text = df.apply(lambda row: separator.join(row.values.astype(str)), axis=1)

    if clean:

        def clean_text(text):
            text = text.lower()  # Lowercase text
            text = re.sub(r"<.*?>+", "", text)  # Remove HTML tags
            # Character class of CR/LF only; the old class also listed "|" by mistake.
            text = re.sub(r"[\r\n]+", " ", text)  # Remove line breaks
            text = re.sub(r"[\W_]+", " ", text)  # Remove punctuation
            text = re.sub(r"\s+", " ", text)  # Collapse runs of whitespace
            return text.strip()

        cleaned_text = combined_text.apply(clean_text)
    else:
        cleaned_text = combined_text

    # Handle missing values, then label the result without writing into ``df``.
    return cleaned_text.fillna("").rename("prepared_text")


def extract_text_with_page_numbers(pdf_path, config):
    """Split the text of a PDF into chunks and record the pages of each chunk.

    Paragraphs are accumulated until the buffer is longer than
    ``config["chunk_threshold"]`` characters *and* ends with sentence
    punctuation; the buffer is then emitted as one chunk.

    Args:
        pdf_path: Path of the PDF to read with pdfminer.
        config: Mapping providing ``"filter_toc_refs"`` (when truthy, only the
            page range returned by ``get_relevant_pages`` is read and
            processing stops at the first paragraph containing the word
            ``REFERENCES``) and ``"chunk_threshold"`` (minimum chunk length).

    Returns:
        ``(chunks, pages_per_chunk)`` -- two parallel lists: the chunk strings
        and, for each chunk, the list of page counters it drew text from.
    """
    pages = None
    if config["filter_toc_refs"]:
        start_page, end_page = get_relevant_pages(pdf_path)
        pages = set(range(start_page, end_page + 1))

    references_heading = re.compile(r"\bREFERENCES\b")
    sentence_endings = (". ", "? ", "! ")

    filtered = []
    pages_per_chunk = []  # Page numbers contributing to each chunk
    temp_string = ""  # Text accumulated for the chunk in progress
    temp_pages = []  # Pages seen by the chunk in progress
    reached_references = False

    # current_page counts every page yielded, including the skipped ones.
    for current_page, page_layout in enumerate(extract_pages(pdf_path), start=1):
        # Test for None first: the old `pageid in pages or pages is None`
        # raised TypeError whenever no page filter was active.
        if pages is not None and page_layout.pageid not in pages:
            continue

        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue

            for string in element.get_text().split("\n\n"):
                cleaned_string = " ".join(string.replace("\n", " ").split())

                if pages is not None and references_heading.search(cleaned_string):
                    reached_references = True
                    break

                if not cleaned_string:
                    continue

                if current_page not in temp_pages:
                    temp_pages.append(current_page)
                temp_string += cleaned_string + " "

                if len(temp_string) > config["chunk_threshold"] and temp_string.endswith(
                    sentence_endings
                ):
                    filtered.append(temp_string.strip())
                    pages_per_chunk.append(temp_pages.copy())
                    temp_string = ""
                    temp_pages = []

            if reached_references:
                break
        if reached_references:
            break

    # Flush whatever is still buffered. This also runs when the REFERENCES
    # heading stopped the scan; previously that path returned immediately and
    # silently dropped the last, not-yet-emitted chunk.
    if temp_string.strip():
        filtered.append(temp_string.strip())
        pages_per_chunk.append(temp_pages)

    return filtered, pages_per_chunk


def extract_by_char_limit(pdf_path, threshold=500):
    """Chunk the text of a PDF by a minimum character count.

    The text returned by ``extract_text`` is split on blank lines; each
    paragraph is whitespace-normalised and appended to a buffer. The buffer is
    emitted once it exceeds ``threshold`` characters and ends with ``". "``,
    ``"? "`` or ``"! "``. A single paragraph longer than ``threshold`` that
    does not end a sentence forces out a buffer holding earlier text.

    Args:
        pdf_path: Path of the PDF to read.
        threshold: Minimum buffer length before a chunk may be emitted.

    Returns:
        List of chunk strings in document order.
    """
    sentence_endings = (". ", "? ", "! ")
    chunks = []
    buffer = ""

    for raw_paragraph in extract_text(pdf_path).split("\n\n"):
        # Fold inner line breaks and repeated blanks into single spaces.
        paragraph = " ".join(raw_paragraph.replace("\n", " ").split())
        buffer += paragraph + " "

        if len(buffer) > threshold and buffer.endswith(sentence_endings):
            chunks.append(buffer.strip())
            buffer = ""
            continue

        if len(paragraph) <= threshold:
            continue

        # Oversized paragraph without a sentence ending.
        if len(buffer.strip()) > len(paragraph):
            # The buffer held earlier text as well: emit everything.
            chunks.append(buffer.strip())
            buffer = ""
        else:
            # The buffer is essentially this paragraph: restart from it.
            buffer = paragraph + " "

    leftover = buffer.strip()
    if leftover:
        chunks.append(leftover)

    return chunks


def extract_by_paragraphs(pdf_path, paragraph_gap=100):
    """Group the text lines of a PDF into paragraphs by vertical spacing.

    Lines are visited in pdfminer layout order. Whenever the ``y0`` of a line
    is more than ``paragraph_gap`` below the ``y0`` of the previous line, the
    paragraph in progress is closed and a new one is started.

    Args:
        pdf_path: Path of the PDF to read.
        paragraph_gap: Vertical distance that starts a new paragraph
            (defaults to the previously hard-coded value of 100).

    Returns:
        List of paragraph strings with surrounding whitespace stripped.
    """
    paragraphs = []
    current_paragraph = ""
    last_y0 = None  # carried across pages, exactly as before

    # Set up the PDF page aggregator
    laparams = LAParams()
    resource_manager = PDFResourceManager()
    device = PDFPageAggregator(resource_manager, laparams=laparams)
    interpreter = PDFPageInterpreter(resource_manager, device)

    # The context manager closes the file handle, which the previous bare
    # open(...) call left open.
    with open(pdf_path, "rb") as pdf_file:
        for page in PDFPage.get_pages(pdf_file):
            interpreter.process_page(page)
            layout = device.get_result()
            for element in layout:
                if not isinstance(element, LTTextBox):
                    continue
                for text_line in element:
                    if last_y0 is not None and (last_y0 - text_line.y0) > paragraph_gap:
                        # Gap is big enough: close the paragraph and start a new one.
                        paragraphs.append(current_paragraph.strip())
                        current_paragraph = text_line.get_text()
                    else:
                        current_paragraph += " " + text_line.get_text()
                    last_y0 = text_line.y0

    if current_paragraph.strip() != "":
        paragraphs.append(current_paragraph.strip())

    return paragraphs


def extract_tables(pdf_path):
    """Read every lattice-style table camelot finds in a PDF.

    Args:
        pdf_path: Path of the PDF passed to ``camelot.read_pdf``.

    Returns:
        ``(tables_list, table_pages)`` -- the DataFrame of each table and, in
        the same order, the page attribute camelot reports for it.
    """
    detected = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")

    tables_list = [table.df for table in detected]
    table_pages = [table.page for table in detected]

    return tables_list, table_pages


def find_captions_with_locations(pdf_path):
    """Locate caption-like lines in a PDF together with their position.

    Horizontal text boxes are screened with a broad set of patterns; the first
    line of a matching box that satisfies the narrower caption patterns is
    kept as that box's caption.

    Args:
        pdf_path: Path of the PDF to scan with pdfminer.

    Returns:
        List of ``(caption, y0, pageid)`` tuples in document order, where
        ``y0`` and ``pageid`` belong to the text box containing the caption.
    """

    def box_is_candidate(text):
        # Broad screen: summary headings, x.y.z numbering, or "Table n.m" / "Table n:".
        return bool(
            re.search(r"\d+(\.\d+)*\s+Summary of evidence and", text, re.IGNORECASE)
            or "Summary of evidence and" in text
            or re.search(r"\d+\.\d+\.\d+(\.\d+)?", text)
            or re.match(r"Table\s+\d+\.\d+", text)
            or re.match(r"Table\s+\d+:", text)
        )

    def line_is_caption(line):
        # Narrow check applied to each individual line of a candidate box.
        return bool(
            re.search(r"\d+(\.\d+)*\s+Summary of evidence and", line, re.IGNORECASE)
            or "Summary of evidence and guidelines" in line
            or re.match(r"Table\s+\d+\.\d+", line)
            or re.match(r"Table\s+\d+:", line)
        )

    captions = []

    for page_layout in extract_pages(pdf_path, laparams=LAParams()):
        for element in page_layout:
            if not isinstance(element, LTTextBoxHorizontal):
                continue

            box_text = element.get_text()
            if not box_is_candidate(box_text):
                continue

            # Only the first qualifying line of a box is used as its caption.
            first_hit = next(
                (line for line in box_text.split("\n") if line_is_caption(line)), None
            )
            if first_hit is not None:
                captions.append((first_hit.strip(), element.y0, page_layout.pageid))

    return captions


def associate_captions_with_tables(captions, tables):
    """Pair each table with the vertically closest caption.

    Candidates are the captions on the table's own page plus those on the
    preceding page; the latter have 420 added to their vertical position
    before distances are compared with ``table._bbox[3]``.

    Args:
        captions: Iterable of ``(caption, y0, pageid)`` tuples.
        tables: Iterable of objects exposing ``page``, ``_bbox`` and ``df``.

    Returns:
        List of ``(caption, table.df)`` tuples; tables without any candidate
        caption are left out.
    """
    pairs = []

    for table in tables:
        current_page = table.page
        table_top = table._bbox[3]  # Top coordinate of the table

        # (caption text, comparable vertical position) for this table
        candidates = []
        for cap in captions:
            if cap[2] == current_page:
                candidates.append((cap[0], cap[1]))
            elif cap[2] == current_page - 1:
                candidates.append((cap[0], cap[1] + 420))

        best_text = None
        best_distance = float("inf")
        matched = False
        for text, position in candidates:
            gap = abs(position - table_top)
            if gap < best_distance:  # strict: the earliest candidate wins ties
                best_text, best_distance, matched = text, gap, True

        if matched:
            pairs.append((best_text, table.df))

    return pairs


def merge_dataframes_with_same_caption(list_of_tuples):
    """Concatenate DataFrames that share a caption.

    Args:
        list_of_tuples: Iterable of ``(caption, DataFrame)`` pairs.

    Returns:
        List of ``(caption, DataFrame)`` pairs with one entry per distinct
        caption, ordered by first appearance. Frames sharing a caption are
        stacked in input order with a fresh integer index.
    """
    by_caption = {}

    for caption, frame in list_of_tuples:
        if caption in by_caption:
            frame = pd.concat([by_caption[caption], frame], ignore_index=True)
        by_caption[caption] = frame

    return list(by_caption.items())


def add_captions_as_rows(list_of_tuples):
    """Prepend each caption to its DataFrame as an extra first row.

    The caption goes into the first column of the new row; every other column
    of that row holds an empty string.

    Args:
        list_of_tuples: Iterable of ``(caption, DataFrame)`` pairs.

    Returns:
        List of DataFrames in input order. A pair whose caption is ``None``
        contributes its DataFrame unchanged.
    """
    result_list = []

    for caption, df in list_of_tuples:
        if caption is None:
            result_list.append(df)
            continue

        # One-row frame laid out like ``df``: caption first, blanks elsewhere.
        caption_row = pd.DataFrame([caption], columns=[df.columns[0]])
        for column in df.columns[1:]:
            caption_row[column] = ""

        # ignore_index renumbers the rows so the caption becomes row 0.
        result_list.append(pd.concat([caption_row, df], ignore_index=True))

    return result_list


def find_nearest_caption(page, table_top, last_caption):
    """Pick the numbered heading that ends closest above ``table_top``.

    Args:
        page: Object whose ``get_text("blocks")`` yields sequences with the
            block's bottom coordinate at index 3 and its text at index 4.
        table_top: Coordinate the block bottoms are measured against.
        last_caption: Fallback returned when no block qualifies.

    Returns:
        The stripped text of the qualifying block with the smallest positive
        ``table_top - bottom`` distance, otherwise ``last_caption``.
    """
    heading_re = re.compile(r"\d+(\.\d+)+\s+[A-Za-z]+.*")
    best_text = ""
    best_gap = float("inf")

    for block in page.get_text("blocks"):
        text = block[4].strip()
        if not heading_re.match(text):
            continue

        gap = table_top - block[3]
        if 0 < gap < best_gap:
            best_text, best_gap = text, gap

    return best_text or last_caption


def extract_and_filter_tables_with_captions(
    pdf_path, tables, headings=["Summary of evidence", "Recommendations"]
):
    """Keep the tables whose first cell names a heading and caption them.

    Args:
        pdf_path: Path of the PDF, opened with PyMuPDF to look up captions.
        tables: Iterable of table objects exposing ``df``, ``page`` and
            ``_bbox``.
        headings: Substrings looked for in the first cell of each table. The
            default list is only read, never modified.

    Returns:
        List of ``(caption, table.df)`` tuples for the matching tables. The
        caption comes from ``find_nearest_caption``; when none is found the
        caption of the previous matching table is reused (``None`` at first).
    """
    filtered_tables_with_captions = []
    last_caption = None

    # The context manager closes the document, which was previously left open.
    with fitz.open(pdf_path) as doc:
        for table in tables:
            # A table without cells has no first cell to inspect; skip it
            # instead of failing on iloc[0, 0].
            if table.df.empty:
                continue

            first_cell = str(table.df.iloc[0, 0])
            if not any(heading in first_cell for heading in headings):
                continue

            page = doc.load_page(table.page - 1)  # load_page is zero-based
            # NOTE(review): index 1 of _bbox is used as the top edge here while
            # associate_captions_with_tables uses index 3 -- confirm which
            # coordinate system each caller expects.
            table_top_edge = table._bbox[1]

            caption = find_nearest_caption(page, table_top_edge, last_caption)
            last_caption = caption  # remembered as the fallback for later tables

            filtered_tables_with_captions.append((caption, table.df))

    return filtered_tables_with_captions


def dataframe_to_markdown(df):
    """Render a raw table DataFrame as Markdown, with an optional description.

    Layout rules:
      * With several columns, the first row is a description row when its
        second cell is blank/NaN and its first cell is non-empty.
      * With a single column, the first row is always the description row.
      * The first remaining row becomes the header, the rest the table body.

    The input frame is never modified. (Previously the single-column branch
    stored a whole row Series as the description, and the header assignment
    could rewrite the caller's column labels.)

    Args:
        df: DataFrame holding the raw table cells.

    Returns:
        Markdown text, preceded by the description and a blank line when a
        description exists. An empty frame yields ``""``; a frame holding only
        a description row yields that description.
    """
    if df.empty:
        return ""

    first_cell = df.iloc[0, 0]
    first_is_missing = bool(pd.isna(first_cell))

    if df.shape[1] > 1:
        second_cell = df.iloc[0, 1]
        second_is_blank = bool(pd.isna(second_cell)) or second_cell == ""
        has_description_row = (
            second_is_blank and not first_is_missing and bool(first_cell)
        )
    else:
        has_description_row = True

    description = None
    if has_description_row and not first_is_missing:
        # Use the cell text itself; empty text means "no description".
        description = str(first_cell).strip() or None

    # Positional slicing plus reset_index yields new frames, so the caller's
    # DataFrame keeps its rows and column labels.
    body = (df.iloc[1:] if has_description_row else df).reset_index(drop=True)
    if body.empty:
        return description or ""

    header = body.iloc[0]
    table = body.iloc[1:].reset_index(drop=True)
    table.columns = header

    markdown_table = table.to_markdown(index=False)

    if description is not None:
        return f"{description}\n\n{markdown_table}"
    return markdown_table


def get_relevant_pages(pdf_path):
    """Find the page range between the INTRODUCTION and REFERENCES sections.

    The text from ``extract_text`` is split on form feeds and scanned in
    order. Counting is per page: the second page matching ``\\sINTRODUCTION\\b``
    sets the start, the second page matching ``\\sREFERENCES\\b`` sets the end
    and stops the scan.

    Args:
        pdf_path: Path of the PDF to scan.

    Returns:
        ``(start_page, end_page)`` as one-based page numbers.

    Raises:
        ValueError: If either section was not matched on two pages before the
            scan ended.
    """
    intro_pattern = re.compile(r"\sINTRODUCTION\b")
    refs_pattern = re.compile(r"\sREFERENCES\b")

    intro_hits = 0
    refs_hits = 0
    start_page = None
    end_index = None

    for index, page_text in enumerate(extract_text(pdf_path).split("\f")):
        if intro_pattern.search(page_text):
            intro_hits += 1
            if intro_hits == 2:
                start_page = index + 1  # indices are zero-based

        if refs_pattern.search(page_text):
            refs_hits += 1
            if refs_hits == 2:
                end_index = index
                break  # nothing after the references is of interest

    if start_page is None or end_index is None:
        raise ValueError("Couldn't find the specified sections twice in the document.")

    return start_page, end_index + 1


def extract_tables_and_captions_with_pdfminer(pdf_path, config):
    """Extract the tables of a PDF as text, each prefixed by its caption.

    Two table sets are produced from one camelot run: tables filtered by
    ``extract_and_filter_tables_with_captions`` (placed first in the output)
    and tables paired with captions via ``associate_captions_with_tables`` and
    merged per caption.

    Args:
        pdf_path: Path of the PDF to process.
        config: Mapping providing ``"markdown_tables"`` (truthy selects
            Markdown rendering) and, for the plain-text branch,
            ``"separator"`` (cell separator).

    Returns:
        ``(table_text, dfs)``. ``table_text`` holds one string per table.
        ``dfs`` holds the matching DataFrames in the Markdown branch and is
        an empty list in the plain-text branch.
    """
    captions = find_captions_with_locations(pdf_path)
    tables = camelot.read_pdf(pdf_path, pages="all", flavor="lattice")

    captioned_tables = add_captions_as_rows(
        merge_dataframes_with_same_caption(
            associate_captions_with_tables(captions, tables)
        )
    )
    evidence_tables = add_captions_as_rows(
        extract_and_filter_tables_with_captions(pdf_path, tables)
    )

    # Evidence/recommendation tables first, then the caption-matched ones.
    ordered_frames = evidence_tables + captioned_tables

    table_text = []
    dfs = []

    if config["markdown_tables"]:
        for frame in ordered_frames:
            table_text.append(dataframe_to_markdown(frame))
            dfs.append(frame)
    else:
        flattened = [
            prepare_all_text_data(frame, separator=config["separator"], clean=False)
            for frame in ordered_frames
        ]
        table_text = [aggregate_table_to_text(rows) for rows in flattened]

    return table_text, dfs


In [None]:

def extract_tables(pdf_path):
    """Return the DataFrames and page attributes of all lattice tables in a PDF.

    NOTE(review): a function with this name and the same behaviour is already
    defined earlier in this notebook; this later definition shadows it.

    Args:
        pdf_path: Path of the PDF passed to ``camelot.read_pdf``.

    Returns:
        ``(tables_list, table_pages)`` as two parallel lists.
    """
    tables_list, table_pages = [], []

    for found in camelot.read_pdf(pdf_path, pages="all", flavor="lattice"):
        tables_list.append(found.df)
        table_pages.append(found.page)

    return tables_list, table_pages

In [None]:
# Sample PDF, addressed relative to the current working directory.
pdf_path = 'data/pdfs/table_00.pdf'

# Run the camelot-based extraction and print both returned lists
# (the "=" f-string form prints the variable name along with its value).
table_list,table_pages = extract_tables(pdf_path)
print(f"{table_list = }")
print(f"{table_pages = }")


In [None]:

# NOTE(review): knowledge_dir_files is not defined anywhere in the visible notebook;
# this cell depends on kernel state (or on the star import) -- confirm its origin.
document_filepath = knowledge_dir_files[1]
print(f"{document_filepath = }")

# Substring test, so ".pdf" anywhere in the path counts as a PDF.
if '.pdf' in document_filepath:
    document_loader = PyMuPDFLoader(document_filepath)
    pages = document_loader.load()

# NOTE(review): `pages` is only assigned inside the branch above; for a non-PDF path
# this line raises NameError on a fresh kernel.
pages


In [None]:

import pymupdf
import fitz 
from time import sleep

# NOTE(review): these statements repeat the previous cell verbatim, and
# knowledge_dir_files is not defined anywhere in the visible notebook.
document_filepath = knowledge_dir_files[1]
print(f"{document_filepath = }")

# Substring test, so ".pdf" anywhere in the path counts as a PDF.
if '.pdf' in document_filepath:
    document_loader = PyMuPDFLoader(document_filepath)
    pages = document_loader.load()

# Not the last statement of the cell, so this bare expression displays nothing.
pages


from IPython.display import Markdown, display

if not hasattr(fitz.Page, "find_tables"):
    raise RuntimeError("This PyMuPDF version does not support the table feature")

# Walk every document and page, rendering each detected table as Markdown.
for document_filepath in knowledge_dir_files:
    doc = fitz.open(document_filepath)
    # page = doc[14]

    for idx,page in enumerate(doc):
        tabs = page.find_tables()  # detect the tables
        md_text = None
        for i,tab in enumerate(tabs):  # iterate over all tables
            print(f"file: {document_filepath} \npage: {idx} \ntable:{i}")
            md_text = tab.to_markdown()
            # Only the last expression of a cell is rendered automatically; inside a
            # loop the Markdown object has to be passed to display() to show up.
            display(Markdown(md_text))
            sleep(1)

            # cur_df = tab.to_pandas()
            # display(cur_df)
            # for cell in tab.header.cells:
            #     page.draw_rect(cell,color=fitz.pdfcolor["red"],width=0.3)
            # page.draw_rect(tab.bbox,color=fitz.pdfcolor["green"])
            # print(f"Table {i} column names: \n{tab.header.names}, \nexternal: {tab.header.external}")

        # show_image(page, f"Table & Header BBoxes")

# DataFrame of the last table seen.
# NOTE(review): raises NameError when no document contained a table.
tab.to_pandas()


In [None]:
# NOTE(review): this cell repeats the table-rendering loop of the previous cell.
from IPython.display import Markdown, display

if not hasattr(fitz.Page, "find_tables"):
    raise RuntimeError("This PyMuPDF version does not support the table feature")

# Walk every document and page, rendering each detected table as Markdown.
for document_filepath in knowledge_dir_files:
    doc = fitz.open(document_filepath)
    # page = doc[14]

    for idx,page in enumerate(doc):
        tabs = page.find_tables()  # detect the tables
        md_text = None
        for i,tab in enumerate(tabs):  # iterate over all tables
            print(f"file: {document_filepath} \npage: {idx} \ntable:{i}")
            md_text = tab.to_markdown()
            # Only the last expression of a cell is rendered automatically; inside a
            # loop the Markdown object has to be passed to display() to show up.
            display(Markdown(md_text))
            sleep(1)

            # cur_df = tab.to_pandas()
            # display(cur_df)
            # for cell in tab.header.cells:
            #     page.draw_rect(cell,color=fitz.pdfcolor["red"],width=0.3)
            # page.draw_rect(tab.bbox,color=fitz.pdfcolor["green"])
            # print(f"Table {i} column names: \n{tab.header.names}, \nexternal: {tab.header.external}")

        # show_image(page, f"Table & Header BBoxes")

# DataFrame of the last table seen.
# NOTE(review): raises NameError when no document contained a table.
tab.to_pandas()


In [None]:
from unstructured.partition.pdf import partition_pdf

# NOTE(review): this path starts with "code/" although the setup cell already changes
# into ".../code" -- confirm the file resolves from the current working directory.
fname = 'code/pdfs/partition_example.pdf'

# Partition the PDF with the 'hi_res' strategy and table-structure inference enabled.
elements = partition_pdf(filename=fname,
                         infer_table_structure=True,
                         strategy='hi_res',
           )


# Keep only the elements whose category is "Table".
tables = [el for el in elements if el.category == "Table"]

# Show the first table as plain text and as HTML.
# NOTE(review): raises IndexError when no table element was found.
print(tables[0].text)
print(tables[0].metadata.text_as_html)

### Conclusion
- The existing method does a good job of extracting the text from the PDF document.
- However, some of the tabular details appear to be lost.


Testing SQL

In [None]:
####################################################################################################
### Testing SQLite

# The `ail` helper module is not imported in this notebook (its import is commented
# out in the first cell), so the standard-library sqlite3 module is used directly.
import sqlite3

# Project-relative location under the same "data/databases" directory used elsewhere
# in this notebook, instead of a machine-specific absolute path.
sqlite_path = os.path.join("data", "databases", "sql", "agent_db.sqlite")
os.makedirs(os.path.dirname(sqlite_path), exist_ok=True)

### Create sqlite3 database
# connect() creates the database file when it does not exist yet.
sqlite_db = sqlite3.connect(sqlite_path)

### Create users table
# Using the connection as a context manager commits on success and rolls back on error.
with sqlite_db:
    sqlite_db.execute(
        """CREATE TABLE IF NOT EXISTS users (
        id INTEGER PRIMARY KEY AUTOINCREMENT,
        name TEXT NOT NULL,
        age INTEGER,
        gender TEXT,
        nationality TEXT
        )
        """
    )

# Load table
# table_name is a constant defined right here; SQL identifiers cannot be bound as
# query parameters, so it is interpolated into the statement.
table_name = 'users'
users_table = pd.read_sql(f"select * from {table_name}", sqlite_db)
users_table

# Add record
# cols = ["name","age","gender","nationality"]
# vals = ["Me","35","male","american"]
# cur_query = f"INSERT INTO users {cols} VALUES {vals}"

### EH CUSTOM
# # ail.execute_query(sqlite_db, cur_query)

### PANDAS
# pd.read_sql_query(cur_query, sqlite_db)

### PYSPARK
# pyspark.sql(ail.execute_query(sqlite_db,"SELECT * FROM users")

# **Image Generation**

In [None]:
# Text prompt sent to the image model.
prompt = """
    I am creating a logo for XYZ.    
    """

# NOTE(review): api_key is not defined anywhere in the visible notebook; presumably it
# comes from the star import or from kernel state -- confirm, and prefer reading it
# from the environment loaded by load_dotenv.
client = OpenAI(api_key=api_key)
# Request a single 1024x1024 image of standard quality from "dall-e-3".
response = client.images.generate(
    model="dall-e-3",
    prompt=prompt,
    size="1024x1024",
    n=1,
    quality="standard",
)

# URL of the first (and only requested) image; shown as the cell output.
image_url = response.data[0].url
image_url

# **TOOLS**

In [None]:
#Demo of Tools
# NOTE(review): api_key and model_name are not defined anywhere in the visible
# notebook; this cell depends on kernel state or the star import -- confirm.
client = OpenAI(api_key=api_key)

# One function tool described by a JSON schema: "location" is required,
# "unit" is optional and limited to two values.
tools = [
    {
        "type": "function",
        "function": {
            "name": "get_current_weather",
            "description": "Get the current weather in a given location",
            "parameters": {
                "type": "object",
                "properties": {
                    "location": {
                        "type": "string",
                        "description": "The city and state, e.g. San Francisco, CA",
                    },
                    "unit": {"type": "string", "enum": ["celsius", "fahrenheit"]},
                },
                "required": ["location"],
            },
        },
    }
]

# Conversation sent to the model: a system instruction and one user question.
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What's the weather like in Boston today?"}]

# tool_choice="auto" is passed so the model may either answer or request the tool.
completion = client.chat.completions.create(
    model=model_name, messages=messages, tools=tools, tool_choice="auto"
)

print(completion)

# Hand-written sample of a chat-completion payload; it is not derived from the
# `completion` object above and nothing in this cell reads it.
chat_completion = {
    "id": "chatcmpl-123",
    "object": "chat.completion",
    "created": 1677652288,
    "model": "gpt-3.5-turbo-0125",
    "system_fingerprint": "fp_44709d6fcb",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "\n\nHello there, how may I assist you today?",
            },
            "logprobs": None,
            "finish_reason": "stop",
        }
    ],
    "usage": {"prompt_tokens": 9, "completion_tokens": 12, "total_tokens": 21},
}

# **WEB SCRAPING**

In [None]:
# User question for the model; it points at the langgraph quickstart documentation.
question = """Can you give me a brief tutorial of python package langgraph? 
    Please reference the official documentation found at: 
    https://langchain-ai.github.io/langgraph/how-tos/docs/quickstart/
    """

# Template with a single {question} placeholder.
# NOTE(review): nothing in this cell fills the placeholder -- presumably a later
# str.format / prompt-template call does; confirm.
response_template = """
    Question: {question}

    Answer: Let's think step by step. 
    Please wrap your answer in lines that are maximally 100 characters in length.
    """

In [None]:
# python-docx documentation pages to scrape.
# NOTE(review): "api/settings.html" and "api/style.html" each appear twice in this list.
website_url = [
    "https://python-docx.readthedocs.io/en/latest/user/documents.html",
    "https://python-docx.readthedocs.io/en/latest/user/text.html",
    "https://python-docx.readthedocs.io/en/latest/user/sections.html",
    "https://python-docx.readthedocs.io/en/latest/user/hdrftr.html",
    "https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html",
    "https://python-docx.readthedocs.io/en/latest/user/styles-using.html",
    "https://python-docx.readthedocs.io/en/latest/user/shapes.html",
    "https://python-docx.readthedocs.io/en/latest/api/document.html",
    "https://python-docx.readthedocs.io/en/latest/api/settings.html",
    "https://python-docx.readthedocs.io/en/latest/api/style.html",
    "https://python-docx.readthedocs.io/en/latest/api/settings.html",
    "https://python-docx.readthedocs.io/en/latest/api/style.html",
    "https://python-docx.readthedocs.io/en/latest/api/text.html",
    "https://python-docx.readthedocs.io/en/latest/api/table.html",
    "https://python-docx.readthedocs.io/en/latest/api/section.html",
    "https://python-docx.readthedocs.io/en/latest/api/shape.html",
    "https://python-docx.readthedocs.io/en/latest/api/dml.html",
    "https://python-docx.readthedocs.io/en/latest/api/shared.html",
    "https://python-docx.readthedocs.io/en/latest/api/enum/index.html",
    "https://python-docx.readthedocs.io/en/latest/api/enum/MsoColorType.html",
]


website_text = []

# dict.fromkeys removes repeated URLs while keeping first-seen order, so a page that
# is listed twice is fetched and written only once.
for site in dict.fromkeys(website_url):
    page_text = scrape_website(site)
    # The demo cell at the end of this notebook treats a falsy scrape_website result
    # as a failed scrape; skip such results so "".join below cannot fail on them.
    if page_text:
        website_text.append(page_text)
    else:
        print(f"Failed to scrape text from {site}")

all_text = "".join(website_text)
print(all_text)

# NOTE(review): root_path and cur_date are not defined in the visible notebook.
# An explicit encoding keeps the output independent of the platform default.
with open(
    root_path + "python_docx_documentation_" + cur_date + ".txt",
    mode="w",
    encoding="utf-8",
) as file:
    file.write(all_text)

In [None]:
# NOTE(review): scrape_website and generate_response are not defined in the visible
# notebook. This cell also rebinds website_url and website_text (a list in the
# previous cell) to single strings.
if __name__ == "__main__":
    website_url = (
        "https://example.com"  # Replace with the URL of the website you want to scrape
    )

    # Scrape text from the website
    website_text = scrape_website(website_url)

    # A falsy result is treated as a failed scrape.
    if website_text:
        # Generate a response using GPT-3 based on the scraped text
        prompt = f"Read the following text from {website_url}: {website_text}"
        response = generate_response(prompt)

        print("Generated Response:")
        print(response)
    else:
        print("Failed to scrape text from the website.")