### Extracting Bill Information from Webpage

This notebook data mines the bill information directly from the Florida Senate Website. I see that the website is structured where there is a specific format for the URLs and the last part of the URL contains the specific bill in question. By providng the URL for the bill, this code is able to extract the relevant info. 

In [1]:
import requests
from bs4 import BeautifulSoup

# URL of the Florida Senate Bill page
url = 'https://www.flsenate.gov/Session/Bill/2024/115/ByCategory/?Tab=BillText'

# Send a GET request to the Florida Senate Bill page
response = requests.get(url)

# Initialize bill_title variable
bill_title = ""

# Check if the request was successful
if response.status_code == 200:
    print("Successfully retrieved the webpage.")
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Find the h2 tag that follows the div with id 'prevNextBillNav'
    bill_title_tag = soup.find('div', id='prevNextBillNav').find_next('h2')
    if bill_title_tag:
        bill_title = bill_title_tag.get_text(strip=True)
        print(f'Bill Title: {bill_title}')
    else:
        print("Bill title tag not found. Check the HTML structure and update the selector accordingly.")

    # Extract the description of the bill from the <p> element with class 'width80'
    bill_description_tag = soup.find('p', class_='width80')
    if bill_description_tag:
        bill_description = bill_description_tag.get_text(strip=True)
        print(f'Bill Description: {bill_description}')
    else:
        print("Bill description tag not found. Check the HTML structure and update the selector accordingly.")

    # Find the link to the bill text PDF
    bill_text_link = soup.find('a', text='Bill Text: PDF')
    if bill_text_link:
        bill_text_url = 'https://www.flsenate.gov' + bill_text_link['href']
        print(f'Bill Text URL: {bill_text_url}')
    else:
        print("No bill text link found. Check the selector.")

else:
    print(f'Failed to retrieve the webpage, status code: {response.status_code}')

# You can now use the bill_title variable later in your script as needed.


Successfully retrieved the webpage.
Bill Title: CS/HB 115: Progressive Supranuclear Palsy and Other Neurodegenerative Diseases Policy Workgroup
Bill Description: Progressive Supranuclear Palsy and Other Neurodegenerative Diseases Policy Workgroup;Requires State Surgeon General to establish progressive supranuclear palsy & other neurodegenerative diseases policy workgroup; provides for duties, membership, & meetings of workgroup; requiring State Surgeon General to submit annual reports & final report by specified date to Governor & Legislature.
No bill text link found. Check the selector.


  bill_text_link = soup.find('a', text='Bill Text: PDF')


### Downloading the Bill Information for Analysis

This code below downloads the PDF of the bill and extracts the texual information needed for summarization. 

In [2]:
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# URL of the Florida Senate Bill page
base_url = 'https://www.flsenate.gov'
#bill_page_url = '/Session/Bill/2024/115/ByCategory/?Tab=BillText'
bill_page_url = 'https://www.flsenate.gov/Session/Bill/2023/2'

# Send a GET request to the Florida Senate Bill page
response = requests.get(urljoin(base_url, bill_page_url))

# Check if the request was successful
if response.status_code == 200:
    print("Successfully retrieved the webpage.")
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')
    
    # Extract the bill PDF link from the <a> element with class 'lnk_BillTextPDF'
    bill_pdf_link = soup.find('a', class_='lnk_BillTextPDF')
    
    if bill_pdf_link:
        # Construct the full URL to the bill text PDF
        bill_pdf_url = urljoin(base_url, bill_pdf_link['href'])
        print(f'Bill PDF URL: {bill_pdf_url}')
        
        # Download the bill PDF
        pdf_response = requests.get(bill_pdf_url)
        
        if pdf_response.status_code == 200:
            # Define the local path where you want to save the PDF
            local_filename = "bill_text.pdf"
            with open(local_filename, 'wb') as pdf_file:
                pdf_file.write(pdf_response.content)
            print(f'Successfully downloaded the bill text PDF to {local_filename}.')
        else:
            print(f'Failed to download the PDF, status code: {pdf_response.status_code}')
    else:
        print("The link to the bill text PDF was not found.")
else:
    print(f'Failed to retrieve the webpage, status code: {response.status_code}')


Successfully retrieved the webpage.
Bill PDF URL: https://www.flsenate.gov/Session/Bill/2023/2/BillText/er/PDF
Successfully downloaded the bill text PDF to bill_text.pdf.


In [None]:
### SUPER IMPORTANT YOU USE THIS VERSION OF OPENAI API
# Default version discontinued these features. 

%pip install openai==0.28

In [3]:
import openai
import fitz  # PyMuPDF

# OpenAI API Key - Make sure to keep this secure and do not expose it in your code
openai.api_key = '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'


# Function to summarize text using the OpenAI Chat Completions API
def summarize_with_openai_chat(text, model="gpt-3.5-turbo"):
    response = openai.ChatCompletion.create(
        model=model,
        messages=[
            {"role": "system", "content": "You are going to generate a 1-3 sentence response summarizing each page of a bill passed in the florida senate. You will recieve the raw text of each page."},
            {"role": "user", "content": text}
        ]
    )
    
    content = response['choices'][0]['message']['content']
    return content

# Path to the PDF file
pdf_path = "bill_text.pdf"

# Open the PDF file
with fitz.open(pdf_path) as pdf:
    for page_num in range(len(pdf)):
        # Get a page of the PDF
        page = pdf[page_num]
        # Extract text from the page
        text = page.get_text()
        
        # Get a summary of the page using the OpenAI Chat Completions API
        summary = summarize_with_openai_chat(text)
        print(f"Summary of page {page_num + 1}:")
        print(summary)


Summary of page 1:
Page 1 of SB 2, 1st Engrossed, provides an overview of the bill which aims to provide relief to the Estate of Molly Parker due to her death resulting from the negligence of the Department of Transportation. The bill includes an appropriation to compensate the estate, limits on compensation and attorney fees, and legislative intent regarding the waiver of certain liens. The page also outlines the circumstances of Ms. Parker's death in a car crash involving a dump truck owned by the Department of Transportation.
Summary of page 2:
On this page, the bill highlights the severe injuries and subsequent death of Ms. Parker due to a car crash. It mentions the medical and surgical costs of her treatment, as well as the financial impact of her lost earnings. Additionally, it acknowledges the mental pain and suffering endured by her surviving family members. The page also refers to an internal investigation conducted by the department into the cause of the collision.
Summary of

### This Creates a Function that generates Pros and Cons

In [4]:
import openai
import fitz  # PyMuPDF
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

def generate_pros_and_cons(summary_text):
    width, height = letter
    # Generate pros
    pros_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to generate pros for supporting a bill based on its summary. Make it no more than 2 sentences."},
            {"role": "user", "content": f"What are the pros of supporting this bill? make it no more than 2 sentences \n\n{summary_text}"}
        ]
    )
    pros = pros_response['choices'][0]['message']['content']

    # Generate cons
    cons_response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=[
            {"role": "system", "content": "You are a helpful assistant designed to generate cons against supporting a bill based on its summary. make it no more than 2 sentences"},
            {"role": "user", "content": f"What are the cons of supporting this bill? Make it no more than 2 sentences\n\n{summary_text}"}
        ]
    )
    cons = cons_response['choices'][0]['message']['content']

    return pros, cons

### This Generates the bill summary file with the pros and cons table

In [5]:
import openai
import fitz  # PyMuPDF
from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Table, TableStyle
from reportlab.lib.styles import getSampleStyleSheet
from reportlab.lib import colors

def create_summary_pdf(input_pdf_path, output_pdf_path, title):
    width, height = letter  # Page size
    styles = getSampleStyleSheet()
    doc = SimpleDocTemplate(output_pdf_path, pagesize=letter)
    story = []
    
    # Instead of adding a Spacer, start by adding the title with less vertical padding
    story.append(Paragraph(title, styles['Title']))
    story.append(Spacer(1, 12))  # Reduce the size of the spacer if needed

    # Open the original PDF
    with fitz.open(input_pdf_path) as pdf:
        cumulative_summary = ""
        for page_num in range(len(pdf)):
            page = pdf[page_num]
            text = page.get_text()
            summary = summarize_with_openai_chat(text)
            cumulative_summary += summary + " "  # Collect all summaries
            story.append(Paragraph(f"<b>Page {page_num + 1} Summary:</b><br/>{summary}", styles['Normal']))
            story.append(Spacer(1, 12))

    # Generate pros and cons after all summaries
    pros, cons = generate_pros_and_cons(cumulative_summary)
    data = [['Cons', 'Pros'],
            [Paragraph(cons, styles['Normal']), Paragraph(pros, styles['Normal'])]]

    # Create the table with proper column widths
    col_widths = [width * 0.45, width * 0.45]
    t = Table(data, colWidths=col_widths)
    t.setStyle(TableStyle([
        ('BACKGROUND', (0, 0), (1, 0), colors.grey),
        ('TEXTCOLOR', (0, 0), (1, 0), colors.whitesmoke),
        ('ALIGN', (0, 0), (-1, -1), 'CENTER'),
        ('VALIGN', (0, 0), (-1, -1), 'TOP'),
        ('INNERGRID', (0, 0), (-1, -1), 0.25, colors.black),
        ('BOX', (0, 0), (-1, -1), 0.25, colors.black),
    ]))
    story.append(t)

    # Build the PDF
    doc.build(story)


# Paths to the input and output PDFs
input_pdf_path = "bill_text.pdf"
output_pdf_path = "bill_summary.pdf"
#bill_title = "Title of the Bill"

# Create the summary PDF
create_summary_pdf(input_pdf_path, output_pdf_path, bill_title)

In [6]:
import fitz  # PyMuPDF

def append_pdf(base_pdf_path, pdf_to_append_path, output_pdf_path):
    # Create a PDF document object
    output_pdf = fitz.open()

    # Open the base pdf
    base_pdf = fitz.open(base_pdf_path)
    # Open the pdf to append
    pdf_to_append = fitz.open(pdf_to_append_path)

    # Append the base pdf pages to the output pdf
    output_pdf.insert_pdf(base_pdf)

    # Append the pdf_to_append pages to the output pdf
    # Updated to remove the 'after' keyword argument
    output_pdf.insert_pdf(pdf_to_append, start_at=len(output_pdf))

    # Save the output pdf
    output_pdf.save(output_pdf_path)
    output_pdf.close()
    base_pdf.close()
    pdf_to_append.close()
    print(f"The file '{output_pdf_path}' has been created with the contents of '{base_pdf_path}' and '{pdf_to_append_path}'")

# File paths
base_pdf_path = 'bill_summary.pdf'
pdf_to_append_path = 'bill_text.pdf'
output_pdf_path = 'combined_bill_document.pdf'

append_pdf(base_pdf_path, pdf_to_append_path, output_pdf_path)

# Append bill_text.pdf to bill_summary.pdf without running the function here.
# Use the function in your environment.
# append_pdf(base_pdf_path, pdf_to_append_path, output_pdf_path)


The file 'combined_bill_document.pdf' has been created with the contents of 'bill_summary.pdf' and 'bill_text.pdf'
