In [None]:
pip install beautifulsoup4



In [None]:
import requests
from bs4 import BeautifulSoup

# URL of the law page
url = "https://www.legislation.gov.au/C1953A00095/2024-07-10/2024-07-10/text/original/epub/OEBPS/document_1/document_1.html#_Toc172280070"

# Maximum number of characters of prettified HTML to print for manual inspection
HTML_SAMPLE_CHARS = 100000

# Send a GET request to fetch the webpage; the timeout keeps the cell from
# hanging indefinitely if the server never answers
response = requests.get(url, timeout=30)

# Check if the request was successful
if response.status_code == 200:
    # Parse the HTML content of the page
    soup = BeautifulSoup(response.content, 'html.parser')

    # Extract document name
    document_name_tag = soup.find('h2', class_='Title-of-Act')
    if document_name_tag:
        document_name = document_name_tag.get_text(strip=True)
        print(f"Document Name: {document_name}")
    else:
        print("Document name not found")

    # Print the beginning of the prettified HTML so the structure around the
    # sections can be inspected. The slice starts at index 0 so the very first
    # character of the document is not dropped.
    print("\nRaw HTML Sample:\n")
    print(soup.prettify()[:HTML_SAMPLE_CHARS])

else:
    print(f"Failed to retrieve the page. Status code: {response.status_code}")

In [None]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import time

# Read the link file (no header row) and keep its first column as a plain
# Python list of URLs to scrape
urls = pd.read_csv('/content/Austrailia_links.csv', header=None)[0].tolist()

# Empty DataFrame that will receive the rows scraped from every URL
all_data = pd.DataFrame()

# Paragraph classes that are treated as the body text of a section
_SECTION_BODY_CLASSES = ('subsection', 'paragraph', 'Definition')


def _collect_section_paragraphs(section):
    """Return the texts of the body paragraphs that directly follow a heading.

    Walks the siblings after ``section`` and stops at the first sibling that
    is not a ``<p>`` carrying one of ``_SECTION_BODY_CLASSES``.
    """
    paragraphs = []
    sibling = section.find_next_sibling()
    while sibling is not None and sibling.name == 'p' and any(
            css_class in sibling.get('class', [])
            for css_class in _SECTION_BODY_CLASSES):
        paragraphs.append(sibling.get_text(" ", strip=True))
        sibling = sibling.find_next_sibling()
    return paragraphs


# Function to scrape a single URL
def scrape_url(url, timeout=30):
    """Scrape one legislation page into a DataFrame with one row per section.

    Args:
        url: Address of the HTML page to download.
        timeout: Seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the whole run.

    Returns:
        A DataFrame with the columns ``document_name``, ``document_date``,
        ``document_section_title`` and ``text``. An empty DataFrame is
        returned when the response status is not 200, when no section with
        body text is found, or when any exception occurs while scraping.
    """
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code != 200:
            # Always hand back a DataFrame so callers can test `.empty`
            # without having to special-case a missing result.
            print(f"Failed to retrieve {url}. Status code: {response.status_code}")
            return pd.DataFrame()

        # Parse the HTML content of the page
        soup = BeautifulSoup(response.content, 'html.parser')

        # Extract document name from <p class="ShortT">
        document_name_tag = soup.find('p', class_='ShortT')
        if document_name_tag:
            document_name = document_name_tag.get_text(strip=True)
        else:
            document_name = "Unknown"

        # The date is not extracted from the page, so a placeholder is stored
        document_date = "Unknown"

        data = []

        # Every <p class="ActHead5"> is a section title; the paragraphs that
        # follow it are gathered in the same iteration, so each row can be
        # appended right away.
        for section in soup.find_all('p', class_='ActHead5'):
            paragraphs = _collect_section_paragraphs(section)
            if not paragraphs:
                # Sections without any body paragraph are not recorded
                continue

            data.append({
                'document_name': document_name,
                'document_date': document_date,
                # Separator " " keeps nested <b>, <i> and <span> texts apart
                'document_section_title': section.get_text(" ", strip=True),
                'text': ' '.join(paragraphs).strip()
            })

        # Convert the scraped data into a DataFrame
        return pd.DataFrame(data)

    except Exception as e:
        print(f"Error scraping {url}: {e}")
        return pd.DataFrame()

# Loop through each URL with a delay between requests
frames = []
for position, url in enumerate(urls):
    # Pause between requests to avoid overwhelming the server; no pause is
    # needed before the first request (or after the last one)
    if position:
        time.sleep(5)  # Pause for 5 seconds (you can adjust this)

    print(f"Scraping: {url}")
    df = scrape_url(url)

    # Keep only results that actually contain rows; the None check guards
    # against a scraper that returns nothing for a failed request
    if df is not None and not df.empty:
        frames.append(df)

# Concatenate once at the end instead of re-copying all rows on every iteration
all_data = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

# Save the final concatenated DataFrame to a CSV file
all_data.to_csv('australia_national_laws.csv', index=False)

# Save the same data as an Excel workbook
all_data.to_excel('australia_national_laws.xlsx', index=False)

# Show the result
print(all_data)


Scraping: https://www.legislation.gov.au/C2022A00037/2023-04-12/2023-04-12/text/original/epub/OEBPS/document_1/document_1.html
Scraping: https://www.legislation.gov.au/C2004A00485/2023-12-15/2023-12-15/text/original/epub/OEBPS/document_1/document_1.html
Scraping: https://www.legislation.gov.au/C2004A05173/2024-05-31/2024-05-31/text/original/epub/OEBPS/document_1/document_1.html
Scraping: https://www.legislation.gov.au/C2023A00121/asmade/2023-12-14/text/original/epub/OEBPS/document_1/document_1.html
Scraping: https://www.legislation.gov.au/C2020A00119/2021-09-01/2021-09-01/text/original/epub/OEBPS/document_1/document_1.html
Scraping: https://www.legislation.gov.au/C2004A00767/2024-03-20/2024-03-20/text/original/epub/OEBPS/document_1/document_1.html
Scraping: https://www.legislation.gov.au/C2007A00137/2024-07-01/2024-07-01/text/original/epub/OEBPS/document_1/document_1.html
Scraping: https://www.legislation.gov.au/C1903A00020/2024-05-06/2024-05-06/text/original/epub/OEBPS/document_1/docu