In [18]:
#https://www.casey.vic.gov.au/view-planning-applications

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
from datetime import datetime



def get_value_by_label(soup, label_text):
    """Return the value shown next to a label cell in a details table.

    Finds the first ``<td>`` whose string equals ``label_text``, moves up to
    its enclosing ``<tr>`` and returns the stripped text of that row's second
    ``<td>``.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object, or any object with a
            compatible ``find`` method).
        label_text: Exact text of the label cell to look for.

    Returns:
        The stripped text of the value cell, or the string ``"Not Found"``
        when the label is absent or its row has no second cell.
    """
    try:
        # ``string=`` is the current spelling of the deprecated ``text=`` filter.
        row = soup.find('td', string=label_text).find_parent('tr')
        value_td = row.find_all('td')[1]
        return value_td.text.strip()
    except (AttributeError, IndexError):
        # AttributeError: find()/find_parent() returned None (label missing).
        # IndexError: the row has fewer than two cells.
        return "Not Found"

def go_to_page(driver, page_number):
    """Click forward through the results pager from page 1 to ``page_number``.

    The function assumes the browser is currently showing page 1 of the
    results grid and clicks one pager link per page until the target is
    reached.  Progress and failures are reported with ``print``; nothing is
    returned, and a missing pager link ends the walk instead of raising.

    Args:
        driver: Selenium WebDriver positioned on the results list.
        page_number: 1-based page to end up on.  Values below 1 are reported
            as unsupported backwards navigation.
    """
    # The pager is addressed as the 17th row of the results grid.
    pager_xpath = (
        '//*[@id="ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView"]'
        '/tbody/tr[17]/td/table/tbody/tr'
    )
    current_page = 1  # Assuming we start on the first page

    while current_page != page_number:
        if current_page % 10 == 0 and page_number > current_page:
            # Click the '...' link after every 10 pages to get to the next set
            # of pages.  Use the same cell index as the main scraping loop:
            # cell 11 when leaving page 10, cell 12 for every later block.
            # The previously hard-coded cell 12 made every jump past page 10 fail.
            ellipsis_cell = 11 if current_page == 10 else 12
            ellipsis_xpath = f'{pager_xpath}/td[{ellipsis_cell}]/a'
            try:
                driver.find_element(By.XPATH, ellipsis_xpath).click()
                time.sleep(3)  # Wait for the next set of pages to load
                current_page += 1  # Move to the first page of the next set
            except NoSuchElementException:
                print("Failed to find the '...' link for the next set of pages.")
                break
        elif current_page < page_number:
            next_page_number = current_page + 1
            next_page_xpath = f'{pager_xpath}/td/a[contains(text(), "{next_page_number}")]'
            try:
                driver.find_element(By.XPATH, next_page_xpath).click()
                time.sleep(3)  # Wait for the next page to load
                current_page = next_page_number
            except NoSuchElementException:
                print(f"Failed to navigate to page {next_page_number}.")
                break
        else:
            # current_page > page_number: only forward navigation is supported.
            print("Navigating backwards is not implemented.")
            break

    if current_page == page_number:
        print(f"Successfully navigated to page {page_number}.")
    else:
        print(f"Failed to navigate to the desired page {page_number}.")


print("Setting up the Selenium WebDriver...")
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

# Input date values (typed into the search form and reused in the output filename)
date_from_input = "01/06/2021"
date_to_input = "01/12/2021"

try:
    print("Navigating to the initial URL...")
    url = "https://www.casey.vic.gov.au/view-planning-applications"
    driver.get(url)

    print("Clicking the necessary button to get to the search page...")
    button_xpath = '//*[@id="block-content"]/article/div/div/div/p[8]/a'
    button = wait.until(EC.element_to_be_clickable((By.XPATH, button_xpath)))
    button.click()

    print("Entering the desired date range...")
    date_from_xpath = '//*[@id="ctl00_Content_txtDateFrom_txtText"]'
    date_to_xpath = '//*[@id="ctl00_Content_txtDateTo_txtText"]'
    date_from = wait.until(EC.element_to_be_clickable((By.XPATH, date_from_xpath)))
    date_to = wait.until(EC.element_to_be_clickable((By.XPATH, date_to_xpath)))

    date_from.clear()
    date_from.send_keys(date_from_input)
    date_to.clear()
    date_to.send_keys(date_to_input)

    print("Clicking the search button...")
    search_button_xpath = '//*[@id="ctl00_Content_btnSearch"]'
    search_button = wait.until(EC.element_to_be_clickable((By.XPATH, search_button_xpath)))
    search_button.click()
except BaseException:
    # If the search page cannot be reached (timeout, interrupt, ...), close the
    # browser before propagating so no Chrome process is left behind.
    driver.quit()
    raise

print("Starting to scrape data...")
results = []             # one list per row of the results grid
detailed_results = []    # one dict per application details page
current_page = 1
stored_current_page = 1
current_index = 1


# Labels looked up on each application details page.  Built once instead of
# once per row; each label is listed a single time (the per-row result is a
# dict, so repeated labels never produced extra columns).
labels = [
    "Application Number", "Application Type", "Estate Name",
    "Proposal Description", "Lodgement Date",
    "Estimated Value", "Status", "Further Info Requested Date",
    "Further Info Received Date", "No of Objections", "Decision",
    "Decision Date", "VCAT Lodged Date", "Public Open Space Exempt",
    "Correction", "PS Number", "PS Stage Number", "External Referral Exempt",
    "Titles Office Approval", "Final Outcome",
    "Final Outcome Date", "Property Address", "Land Description",
    "Ward", "eTrack Application Details Page", "Proposal",
    "Permit Type", "Relationship", "Advertising Commencement", "Advertising Completion",
    "Responsible Authority Outcome", "Version Lodged Date", "Permit Ext Start Date",
    "Permit Ext End Date", "Refer to TRANSPORT FOR VICTORIA (FORMERLY PTV)",
    "Refer to MELBOURNE WATER",
    "Refer to CFA", "Refer to TRANSPORT FOR VICTORIA (VICROADS)",
    "Refer to APA VTS AUSTRALIA", "Refer to VICTRACK",
    "Change Permit Applicant",
    "Section 50 Amendment", "Submit Additional Information",
    "Withdraw Application", "Respond to / Extend RFI",
]

# Locators shared by every iteration of the scraping loop.
grid_xpath = '//*[@id="ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView"]'
# The pager is addressed as the 17th row of the grid.
pager_xpath = f'{grid_xpath}/tbody/tr[17]/td/table/tbody/tr'

try:
    while True:
        try:
            print(f"Scraping data from page {current_page}...")
            wait.until(EC.visibility_of_element_located((By.XPATH, grid_xpath)))
            rows = driver.find_elements(By.XPATH, f'{grid_xpath}/tbody/tr')

            # Summary columns: read every row before any click, because the
            # detail loop below leaves and reloads this page.
            for row in rows[1:-1]:  # Skipping the header and pagination rows
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) >= 7:  # Ensure there are enough cells
                    link = cells[0].find_element(By.TAG_NAME, "a").text if cells[0].find_elements(By.TAG_NAME, "a") else 'No Link'
                    date = cells[1].text
                    proposal = cells[2].text
                    app_type = cells[3].text
                    category_description = cells[4].text  # Assuming it's the fifth column
                    address = cells[5].text
                    status = cells[6].text
                    results.append([link, date, proposal, app_type, category_description, address, status])

            # Scrape detailed data: open each row's details page in turn.
            for row_index in range(1, len(rows) - 1):
                try:
                    # Rows are re-located by position on every pass.
                    details_link_selector = f'#ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView > tbody > tr:nth-child({row_index + 1}) > td:nth-child(1) > a'
                    driver.find_element(By.CSS_SELECTOR, details_link_selector).click()
                    print(f"Accessing details of item {row_index} on page {current_page}...")
                    time.sleep(5)

                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    data = {label: get_value_by_label(soup, label) for label in labels}
                    detailed_results.append(data)

                    print(f"Data from item {row_index} scraped. Returning to the list...")

                    # Use the "Previous" button to return to the main list
                    return_button_xpath = '//*[@id="ctl00_Content_btnPrevious"]'
                    driver.find_element(By.XPATH, return_button_xpath).click()
                    time.sleep(10)  # Wait for the main list to load

                    # Ensure we are on the correct page after returning from the details view
                    if current_page > 1:
                        go_to_page(driver, current_page)
                        wait.until(EC.visibility_of_element_located((By.XPATH, grid_xpath)))
                        time.sleep(2)  # Adjust timing as necessary

                except NoSuchElementException:
                    print(f"No more detail items to scrape on page {current_page}.")
                    break

            # The two lists are later joined side by side by position.  If the
            # detail loop stopped early, pad with empty records so rows from
            # the following pages still line up with their summary rows.
            missing_details = len(results) - len(detailed_results)
            if missing_details > 0:
                detailed_results.extend({} for _ in range(missing_details))

            # Handle pagination
            next_page_number = current_page + 1
            if current_page % 10 == 0:
                print(f"Handling special pagination at page {current_page}...")
                ellipsis_xpath = f'{pager_xpath}/td[{11 if current_page == 10 else 12}]/a'
                try:
                    driver.find_element(By.XPATH, ellipsis_xpath).click()
                    print(f"Clicked on '...' to navigate to special page after page {current_page}.")
                    time.sleep(3)  # Wait for the next set of pages to load
                except NoSuchElementException:
                    print("No '...' link found. Exiting loop.")
                    break  # Exit loop if '...' link is not found
            else:
                next_page_xpath = f'{pager_xpath}/td/a[contains(text(), "{next_page_number}")]'
                try:
                    driver.find_element(By.XPATH, next_page_xpath).click()
                    print(f"Navigating to page {next_page_number}.")
                    time.sleep(5)  # Wait for the next page to load
                except NoSuchElementException:
                    print(f"No link found for page {next_page_number}. Exiting loop.")
                    break  # Exit loop if next page number is not found

            current_page += 1

        except TimeoutException:
            print("Timeout exception occurred. Exiting loop.")
            break  # Exit the loop if the table is not visible
finally:
    # Always release the browser, including when an unexpected error
    # (stale element, intercepted click, interrupt, ...) aborts the scrape.
    print("Closing the browser...")
    driver.quit()


# Summary columns scraped from the results grid
df = pd.DataFrame(results, columns=['App Link', 'Lodgement Date', 'Proposal', 'App Type', 'category_description', 'Address', 'Status'])

# Fields scraped from each application details page
df_detailed = pd.DataFrame(detailed_results)

# Concatenate the dataframes side by side (rows are matched by position)
combined_df = pd.concat([df, df_detailed], axis=1)

# Build the output filename from the input dates (slashes removed) and the
# current time formatted as YYYYMMDD_HHMMSS.  The timestamp is computed once.
date_from_formatted = date_from_input.replace("/", "")
date_to_formatted = date_to_input.replace("/", "")
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'planning_history_{date_from_formatted}-{date_to_formatted}_applications_{timestamp}.csv'
combined_df.to_csv(filename, index=False)

print(f"Data scraping complete. File '{filename}' saved.")


Setting up the Selenium WebDriver...
Navigating to the initial URL...
Clicking the necessary button to get to the search page...
Entering the desired date range...
Clicking the search button...
Starting to scrape data...
Scraping data from page 1...
Accessing details of item 1 on page 1...
Data from item 1 scraped. Returning to the list...
Accessing details of item 2 on page 1...
Data from item 2 scraped. Returning to the list...
Accessing details of item 3 on page 1...
Data from item 3 scraped. Returning to the list...
Accessing details of item 4 on page 1...
Data from item 4 scraped. Returning to the list...
Accessing details of item 5 on page 1...
Data from item 5 scraped. Returning to the list...
Accessing details of item 6 on page 1...
Data from item 6 scraped. Returning to the list...
Accessing details of item 7 on page 1...
Data from item 7 scraped. Returning to the list...
Accessing details of item 8 on page 1...
Data from item 8 scraped. Returning to the list...
Accessing deta

In [None]:
#backup code
#https://www.casey.vic.gov.au/view-planning-applications

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from bs4 import BeautifulSoup
from datetime import datetime



def get_value_by_label(soup, label_text):
    """Return the value shown next to a label cell in a details table.

    Finds the first ``<td>`` whose string equals ``label_text``, moves up to
    its enclosing ``<tr>`` and returns the stripped text of that row's second
    ``<td>``.

    Args:
        soup: Parsed page (a ``BeautifulSoup`` object, or any object with a
            compatible ``find`` method).
        label_text: Exact text of the label cell to look for.

    Returns:
        The stripped text of the value cell, or the string ``"Not Found"``
        when the label is absent or its row has no second cell.
    """
    try:
        # ``string=`` is the current spelling of the deprecated ``text=`` filter.
        row = soup.find('td', string=label_text).find_parent('tr')
        value_td = row.find_all('td')[1]
        return value_td.text.strip()
    except (AttributeError, IndexError):
        # AttributeError: find()/find_parent() returned None (label missing).
        # IndexError: the row has fewer than two cells.
        return "Not Found"

def go_to_page(driver, page_number):
    """Click forward through the results pager from page 1 to ``page_number``.

    The function assumes the browser is currently showing page 1 of the
    results grid and clicks one pager link per page until the target is
    reached.  Progress and failures are reported with ``print``; nothing is
    returned, and a missing pager link ends the walk instead of raising.

    Args:
        driver: Selenium WebDriver positioned on the results list.
        page_number: 1-based page to end up on.  Values below 1 are reported
            as unsupported backwards navigation.
    """
    # The pager is addressed as the 17th row of the results grid.
    pager_xpath = (
        '//*[@id="ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView"]'
        '/tbody/tr[17]/td/table/tbody/tr'
    )
    current_page = 1  # Assuming we start on the first page

    while current_page != page_number:
        if current_page % 10 == 0 and page_number > current_page:
            # Click the '...' link after every 10 pages to get to the next set
            # of pages.  Use the same cell index as the main scraping loop:
            # cell 11 when leaving page 10, cell 12 for every later block.
            # The previously hard-coded cell 12 made every jump past page 10 fail.
            ellipsis_cell = 11 if current_page == 10 else 12
            ellipsis_xpath = f'{pager_xpath}/td[{ellipsis_cell}]/a'
            try:
                driver.find_element(By.XPATH, ellipsis_xpath).click()
                time.sleep(3)  # Wait for the next set of pages to load
                current_page += 1  # Move to the first page of the next set
            except NoSuchElementException:
                print("Failed to find the '...' link for the next set of pages.")
                break
        elif current_page < page_number:
            next_page_number = current_page + 1
            next_page_xpath = f'{pager_xpath}/td/a[contains(text(), "{next_page_number}")]'
            try:
                driver.find_element(By.XPATH, next_page_xpath).click()
                time.sleep(3)  # Wait for the next page to load
                current_page = next_page_number
            except NoSuchElementException:
                print(f"Failed to navigate to page {next_page_number}.")
                break
        else:
            # current_page > page_number: only forward navigation is supported.
            print("Navigating backwards is not implemented.")
            break

    if current_page == page_number:
        print(f"Successfully navigated to page {page_number}.")
    else:
        print(f"Failed to navigate to the desired page {page_number}.")


print("Setting up the Selenium WebDriver...")
driver = webdriver.Chrome()
wait = WebDriverWait(driver, 10)

# Input date values (typed into the search form and reused in the output filename)
date_from_input = "01/01/2021"
date_to_input = "01/06/2021"

try:
    print("Navigating to the initial URL...")
    url = "https://www.casey.vic.gov.au/view-planning-applications"
    driver.get(url)

    print("Clicking the necessary button to get to the search page...")
    button_xpath = '//*[@id="block-content"]/article/div/div/div/p[8]/a'
    button = wait.until(EC.element_to_be_clickable((By.XPATH, button_xpath)))
    button.click()

    print("Entering the desired date range...")
    date_from_xpath = '//*[@id="ctl00_Content_txtDateFrom_txtText"]'
    date_to_xpath = '//*[@id="ctl00_Content_txtDateTo_txtText"]'
    date_from = wait.until(EC.element_to_be_clickable((By.XPATH, date_from_xpath)))
    date_to = wait.until(EC.element_to_be_clickable((By.XPATH, date_to_xpath)))

    date_from.clear()
    date_from.send_keys(date_from_input)
    date_to.clear()
    date_to.send_keys(date_to_input)

    print("Clicking the search button...")
    search_button_xpath = '//*[@id="ctl00_Content_btnSearch"]'
    search_button = wait.until(EC.element_to_be_clickable((By.XPATH, search_button_xpath)))
    search_button.click()
except BaseException:
    # If the search page cannot be reached (timeout, interrupt, ...), close the
    # browser before propagating so no Chrome process is left behind.
    driver.quit()
    raise

print("Starting to scrape data...")
results = []             # one list per row of the results grid
detailed_results = []    # one dict per application details page
current_page = 1
stored_current_page = 1
current_index = 1


# Labels looked up on each application details page.  Built once instead of
# once per row; each label is listed a single time (the per-row result is a
# dict, so repeated labels never produced extra columns).
labels = [
    "Application Number", "Application Type", "Estate Name",
    "Proposal Description", "Lodgement Date",
    "Estimated Value", "Status", "Further Info Requested Date",
    "Further Info Received Date", "No of Objections", "Decision",
    "Decision Date", "VCAT Lodged Date", "Public Open Space Exempt",
    "Correction", "PS Number", "PS Stage Number", "External Referral Exempt",
    "Titles Office Approval", "Final Outcome",
    "Final Outcome Date", "Property Address", "Land Description",
    "Ward", "eTrack Application Details Page", "Proposal",
    "Permit Type", "Relationship", "Advertising Commencement", "Advertising Completion",
    "Responsible Authority Outcome", "Version Lodged Date", "Permit Ext Start Date",
    "Permit Ext End Date", "Refer to TRANSPORT FOR VICTORIA (FORMERLY PTV)",
    "Refer to MELBOURNE WATER",
    "Refer to CFA", "Refer to TRANSPORT FOR VICTORIA (VICROADS)",
    "Refer to APA VTS AUSTRALIA", "Refer to VICTRACK",
    "Change Permit Applicant",
    "Section 50 Amendment", "Submit Additional Information",
    "Withdraw Application", "Respond to / Extend RFI",
]

# Locators shared by every iteration of the scraping loop.
grid_xpath = '//*[@id="ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView"]'
# The pager is addressed as the 17th row of the grid.
pager_xpath = f'{grid_xpath}/tbody/tr[17]/td/table/tbody/tr'

try:
    while True:
        try:
            print(f"Scraping data from page {current_page}...")
            wait.until(EC.visibility_of_element_located((By.XPATH, grid_xpath)))
            rows = driver.find_elements(By.XPATH, f'{grid_xpath}/tbody/tr')

            # Summary columns: read every row before any click, because the
            # detail loop below leaves and reloads this page.
            for row in rows[1:-1]:  # Skipping the header and pagination rows
                cells = row.find_elements(By.TAG_NAME, "td")
                if len(cells) >= 7:  # Ensure there are enough cells
                    link = cells[0].find_element(By.TAG_NAME, "a").text if cells[0].find_elements(By.TAG_NAME, "a") else 'No Link'
                    date = cells[1].text
                    proposal = cells[2].text
                    app_type = cells[3].text
                    category_description = cells[4].text  # Assuming it's the fifth column
                    address = cells[5].text
                    status = cells[6].text
                    results.append([link, date, proposal, app_type, category_description, address, status])

            # Scrape detailed data: open each row's details page in turn.
            for row_index in range(1, len(rows) - 1):
                try:
                    # Rows are re-located by position on every pass.
                    details_link_selector = f'#ctl00_Content_cusResultsGrid_repWebGrid_ctl00_grdWebGridTabularView > tbody > tr:nth-child({row_index + 1}) > td:nth-child(1) > a'
                    driver.find_element(By.CSS_SELECTOR, details_link_selector).click()
                    print(f"Accessing details of item {row_index} on page {current_page}...")
                    time.sleep(5)

                    soup = BeautifulSoup(driver.page_source, 'html.parser')
                    data = {label: get_value_by_label(soup, label) for label in labels}
                    detailed_results.append(data)

                    print(f"Data from item {row_index} scraped. Returning to the list...")

                    # Use the "Previous" button to return to the main list
                    return_button_xpath = '//*[@id="ctl00_Content_btnPrevious"]'
                    driver.find_element(By.XPATH, return_button_xpath).click()
                    time.sleep(10)  # Wait for the main list to load

                    # Ensure we are on the correct page after returning from the details view
                    if current_page > 1:
                        go_to_page(driver, current_page)
                        wait.until(EC.visibility_of_element_located((By.XPATH, grid_xpath)))
                        time.sleep(2)  # Adjust timing as necessary

                except NoSuchElementException:
                    print(f"No more detail items to scrape on page {current_page}.")
                    break

            # The two lists are later joined side by side by position.  If the
            # detail loop stopped early, pad with empty records so rows from
            # the following pages still line up with their summary rows.
            missing_details = len(results) - len(detailed_results)
            if missing_details > 0:
                detailed_results.extend({} for _ in range(missing_details))

            # Handle pagination
            next_page_number = current_page + 1
            if current_page % 10 == 0:
                print(f"Handling special pagination at page {current_page}...")
                ellipsis_xpath = f'{pager_xpath}/td[{11 if current_page == 10 else 12}]/a'
                try:
                    driver.find_element(By.XPATH, ellipsis_xpath).click()
                    print(f"Clicked on '...' to navigate to special page after page {current_page}.")
                    time.sleep(3)  # Wait for the next set of pages to load
                except NoSuchElementException:
                    print("No '...' link found. Exiting loop.")
                    break  # Exit loop if '...' link is not found
            else:
                next_page_xpath = f'{pager_xpath}/td/a[contains(text(), "{next_page_number}")]'
                try:
                    driver.find_element(By.XPATH, next_page_xpath).click()
                    print(f"Navigating to page {next_page_number}.")
                    time.sleep(5)  # Wait for the next page to load
                except NoSuchElementException:
                    print(f"No link found for page {next_page_number}. Exiting loop.")
                    break  # Exit loop if next page number is not found

            current_page += 1

        except TimeoutException:
            print("Timeout exception occurred. Exiting loop.")
            break  # Exit the loop if the table is not visible
finally:
    # Always release the browser, including when an unexpected error
    # (stale element, intercepted click, interrupt, ...) aborts the scrape.
    print("Closing the browser...")
    driver.quit()


# Summary columns scraped from the results grid
df = pd.DataFrame(results, columns=['App Link', 'Lodgement Date', 'Proposal', 'App Type', 'category_description', 'Address', 'Status'])

# Fields scraped from each application details page
df_detailed = pd.DataFrame(detailed_results)

# Concatenate the dataframes side by side (rows are matched by position)
combined_df = pd.concat([df, df_detailed], axis=1)

# Build the output filename from the input dates (slashes removed) and the
# current time formatted as YYYYMMDD_HHMMSS.  The timestamp is computed once.
date_from_formatted = date_from_input.replace("/", "")
date_to_formatted = date_to_input.replace("/", "")
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
filename = f'planning_history_{date_from_formatted}-{date_to_formatted}_applications_{timestamp}.csv'
combined_df.to_csv(filename, index=False)

print(f"Data scraping complete. File '{filename}' saved.")
