# :)

In [1]:
import requests
from bs4 import BeautifulSoup
import urllib.parse
import os

def get_links_from_url(url, base_url, visited, current_depth, username=None, password=None):
    """
    Function to retrieve the unvisited subdirectory links found on the page at `url`.

    Every anchor tag with an 'href' attribute is examined. A link is kept only when:
      * its href ends with '/' and does not start with '?' or '../',
      * the absolute URL (href resolved against `url`) starts with `base_url`,
      * the absolute URL differs from `url` and is not in `visited`,
      * the absolute URL contains more '/' characters than `current_depth`.

    Args:
        url: Address of the page to fetch and scan.
        base_url: Prefix that every returned URL must start with.
        visited: Collection of already-processed URLs; it is only read here, never modified.
        current_depth: Number of '/' characters a returned URL has to exceed.
        username: Optional user name for HTTP basic authentication.
        password: Optional password for HTTP basic authentication.

    Returns:
        A list of absolute URLs in page order, without repeats. An empty list is returned
        (after printing a message) when the response status code is not 200.
    """
    # Earlier versions of this function took no credential parameters and read
    # module-level `username` / `password` variables instead. Existing callers
    # still rely on that, so fall back to those variables when they exist --
    # but without raising NameError when they were never defined.
    if username is None and password is None:
        module_vars = globals()
        username = module_vars.get('username')
        password = module_vars.get('password')

    # Only send basic auth when a user name is available; passing (None, None)
    # would make requests transmit the literal credentials "None:None".
    auth = (username, password) if username is not None else None

    response = requests.get(url, auth=auth)
    if response.status_code != 200:
        print(f"Failed to access {url}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')
    links = []

    # Find all anchor tags with href attributes
    for link in soup.find_all('a', href=True):
        href = link['href']

        # Skip query links, links that move up a level, and anything that is
        # not written as a directory (no trailing slash).
        if href.startswith(('?', '../')) or not href.endswith('/'):
            continue

        full_url = urllib.parse.urljoin(url, href)

        # Stay below base_url and never return the page itself or a visited URL.
        if not full_url.startswith(base_url) or full_url == url or full_url in visited:
            continue

        # Depth is measured by the number of '/' characters. The membership
        # test keeps a directory that is linked twice on the same page from
        # being returned (and therefore crawled) twice.
        if full_url.count('/') > current_depth and full_url not in links:
            links.append(full_url)

    return links

def download_csv_files(url, base_url, download_dir, username=None, password=None):
    """
    Function to download all EDF files linked from a given URL and save them to a specified directory.

    Note: the function name mentions CSV for historical reasons; the links that are actually
    downloaded are those whose href ends with '.edf'.

    Args:
        url: Address of the page to scan for '.edf' links.
        base_url: Not used by this function; kept so existing callers keep working.
        download_dir: Local directory the files are written to; created if it does not exist.
        username: Optional user name for HTTP basic authentication.
        password: Optional password for HTTP basic authentication.

    Returns:
        None. Progress and failures are reported with print(). Each file is stored under the
        last path component of its URL, so files with the same name overwrite each other.
    """
    # Only send basic auth when a user name was supplied; passing (None, None)
    # would make requests transmit the literal credentials "None:None".
    auth = (username, password) if username is not None else None

    response = requests.get(url, auth=auth)
    if response.status_code != 200:
        print(f"Failed to access {url}. Status code: {response.status_code}")
        return

    soup = BeautifulSoup(response.text, 'html.parser')

    # Create the directory if it doesn't exist (exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test).
    os.makedirs(download_dir, exist_ok=True)

    # Find all anchor tags with href attributes pointing to EDF files
    for link in soup.find_all('a', href=True):
        href = link['href']
        if not href.endswith('.edf'):
            continue

        file_url = urllib.parse.urljoin(url, href)
        file_name = os.path.basename(file_url)
        file_path = os.path.join(download_dir, file_name)

        # Stream the body to disk in chunks so a large file is never held in
        # memory as a whole; the with-block releases the connection afterwards.
        with requests.get(file_url, auth=auth, stream=True) as edf_response:
            if edf_response.status_code != 200:
                print(f"Failed to download {file_name}")
                continue
            with open(file_path, 'wb') as file:
                for chunk in edf_response.iter_content(chunk_size=1024 * 1024):
                    file.write(chunk)
        print(f"Downloaded: {file_path}")

def find_all_third_level_and_download_csvs(base_url, download_dir, username=None, password=None):
    """
    Function to walk three levels of subdirectories below `base_url` and download the EDF files
    found in every third-level directory into `download_dir`.

    Subdirectories are discovered with get_links_from_url() and files are fetched with
    download_csv_files(). `username` and `password` are passed to download_csv_files() only;
    they are not forwarded to get_links_from_url(). Each URL is printed as it is reached.
    """
    seen = set()  # URLs that have already been reached
    root_depth = base_url.count('/')  # Depth is measured as the number of '/' characters
    print(f"Base URL: {base_url}")

    # Level 1: directories directly below the base URL
    for level_one in get_links_from_url(base_url, base_url, seen, root_depth):
        seen.add(level_one)
        depth_one = level_one.count('/')
        print(f"First-level URL: {level_one}")

        # Level 2: directories below each first-level directory
        for level_two in get_links_from_url(level_one, base_url, seen, depth_one):
            seen.add(level_two)
            depth_two = level_two.count('/')
            print(f"Second-level URL: {level_two}")

            # Level 3: directories whose EDF files get downloaded
            for level_three in get_links_from_url(level_two, base_url, seen, depth_two):
                seen.add(level_three)
                print(f"Third-level URL: {level_three}")
                download_csv_files(level_three, base_url, download_dir, username, password)

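# Example usage (uncomment the assignments and fill in real values before running the call below):
# base_url = "https://example.org/data/"   # URL of the top-level directory listing
# download_dir = "downloads"               # local folder that receives the EDF files
# Authentication credentials (HTTP basic auth):
# username = "your-username"
# password = "your-password"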

# Start the crawl. base_url, download_dir, username and password must already
# be defined in the session, otherwise this call raises NameError.
find_all_third_level_and_download_csvs(
    base_url=base_url,
    download_dir=download_dir,
    username=username,
    password=password,
)


In [None]:
# 