In [1]:
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.firefox.firefox_binary import FirefoxBinary
from selenium.webdriver.firefox.options import Options
import os
import time
import requests
import pandas as pd
from bs4 import BeautifulSoup
import re
import urllib.request

# Define the path to the Firefox executable
firefox_path = 'C:\\Program Files\\Mozilla Firefox\\firefox.exe'
wd = 'C:/Users/nick2/Desktop/Capstone TT/Geological-Report-Similarity-Analysis'


# Create a Firefox options object
firefox_options = Options()
firefox_options.binary_location = firefox_path

# Initialize the Firefox webdriver with the specified options
driver = webdriver.Firefox(options=firefox_options)

# Set working directory
os.chdir(wd)

# Set the main directory where all data will be extracted
main_folder_path = os.path.join(os.getcwd(), 'WAMEX_DATA_EXTRACTED')

# Create the directory if it doesn't exist
if not os.path.exists(main_folder_path):
    os.mkdir(main_folder_path)

# Define a function to wait until a file is downloaded
def wait_for_download(download_folder, expected_files=1, timeout=60):
    wait_time = 0
    while len(os.listdir(download_folder)) < expected_files and wait_time < timeout:
        time.sleep(1)  # Wait for 1 second
        wait_time += 1

    if wait_time >= timeout:
        print("Download timed out!")

# Read in Excel File Containing all the links to the reports
df = pd.read_excel("WAMEX Results.xlsx", sheet_name="WAMEX Results")
urls = df['Contents'].tolist()
names = df['ANumber'].tolist()


def download_and_save(df, file_extension, folder_path):
    for index, row in df.iterrows():
        file_name = row['link_text']
        
        # Remove the existing extension if present
        file_name_without_extension = os.path.splitext(file_name)[0]
        
        url = row['link_url']
        
        if not url:
            print(f"File name not found for {file_name}")
            continue

        # Use the file name without extension here
        output_path = os.path.join(folder_path, f"{file_name_without_extension}.{file_extension}")
        
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'}
        
        try:
            req = urllib.request.Request(url, headers=headers)
            with urllib.request.urlopen(req) as response, open(output_path, 'wb') as out_file:
                data = response.read()  # a `bytes` object
                out_file.write(data)
            print(f"File downloaded to {output_path}")
        except Exception as e:
            print(f"Failed to download {file_name}. Error: {e}")


for url, name in zip(urls, names):
    folder_path = os.path.join(main_folder_path, str(name))
    if not os.path.exists(folder_path):
        os.mkdir(folder_path)
    
    driver.get(url)
    time.sleep(15)  # Adjust as necessary
    
    rendered_html = driver.page_source
    soup = BeautifulSoup(rendered_html, 'html.parser')
    
    tr_texts = [tr.get_text() for tr in soup.find_all('tr')]
    sizes = [re.search(r"[0-9.]+ (KB|MB|GB|BYTES)", text.upper()) for text in tr_texts if re.search(r"[0-9.]+ (KB|MB|GB|BYTES)", text.upper()) is not None]
    
    a_elements = soup.find_all('a', href=True)
    links_data = [{'link_text': a.get_text(), 'link_url': a['href']} for a in a_elements if 'http' in a['href']]
    
    links_df = pd.DataFrame(links_data)
    links_df['Size'] = sizes
    links_df['Size'] = links_df['Size'].astype(str)
    links_df['Size'] = links_df['Size'].str.extract(r"'([^']+)'")
    links_df['value'] = links_df['Size'].str.extract(r"(\d+\.\d+)").astype(float)
    links_df['unit'] = links_df['Size'].str.extract(r"([A-Za-z]+)")

    selected_df = links_df[((links_df['unit'].str.upper() == 'KB') | 
                           ((links_df['unit'].str.upper() == 'MB') & (links_df['value'] < 30)) | 
                           (links_df['unit'].str.upper() == 'BYTES'))]

    pdf_urls = selected_df[selected_df['link_text'].str.strip().str.contains(r"\.PDF$", case=False)]
    doc_urls = selected_df[selected_df['link_text'].str.strip().str.contains(r"\.DOCX$", case=False)]
    zip_urls = selected_df[selected_df['link_text'].str.strip().str.contains(r"\.ZIP$", case=False)]

    download_and_save(pdf_urls, 'pdf', folder_path)
    download_and_save(doc_urls, 'docx', folder_path)
    download_and_save(zip_urls, 'zip', folder_path)

# Close the driver
driver.quit()


The geckodriver version (0.32.1) detected in PATH at C:\Users\nick2\Documents\geckodriver-v0.32.1-linux-aarch64.tar\geckodriver-v0.32.1-linux-aarch64\geckodriver.exe might not be compatible with the detected firefox version (118.0.2.8682); currently, geckodriver 0.33.0 is recommended for firefox 118.*, so it is advised to delete the driver in PATH and retry


File downloaded to C:\Users\nick2\Desktop\Capstone TT\Geological-Report-Similarity-Analysis\WAMEX_DATA_EXTRACTED\99584\ A099584_3d_model_16321816.zip
File downloaded to C:\Users\nick2\Desktop\Capstone TT\Geological-Report-Similarity-Analysis\WAMEX_DATA_EXTRACTED\99584\ A099584_drill_16323032.zip
File downloaded to C:\Users\nick2\Desktop\Capstone TT\Geological-Report-Similarity-Analysis\WAMEX_DATA_EXTRACTED\99584\ A099584_geochem_16324484.zip
File downloaded to C:\Users\nick2\Desktop\Capstone TT\Geological-Report-Similarity-Analysis\WAMEX_DATA_EXTRACTED\99584\ A099584_geophysics_16325237.zip
File downloaded to C:\Users\nick2\Desktop\Capstone TT\Geological-Report-Similarity-Analysis\WAMEX_DATA_EXTRACTED\9883\ A9883_a9883_a009883_a9883_15862544_(OCR).pdf
File downloaded to C:\Users\nick2\Desktop\Capstone TT\Geological-Report-Similarity-Analysis\WAMEX_DATA_EXTRACTED\98301\ A98301_a98301_a098301_e15_1314_2013s_15638309_(OCR).pdf
File downloaded to C:\Users\nick2\Desktop\Capstone TT\Geologic

KeyboardInterrupt: 