In [5]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
import requests
from bs4 import BeautifulSoup
import re
import os
from datetime import datetime

class DriveScanner:
    """Scan a web page for Google Drive links and list the files behind them.

    File links found on the page are resolved to their Drive title; folder
    links are expanded into the files they directly contain. Files whose
    name ends in one of ``EXCLUDED_SUFFIXES`` (followed by ``.pdf``) are
    skipped. The collected ``"<title>: <url>"`` lines are written to a
    timestamped text file in ``output_dir``.

    Attributes:
        drive: Authenticated PyDrive ``GoogleDrive`` client.
        file_count: Running total of file names accepted by
            ``should_include_file`` over the lifetime of the instance.
        output_dir: Directory that receives the generated link lists.
    """

    # Name markers (placed right before ".pdf") that cause a file to be skipped.
    EXCLUDED_SUFFIXES = (
        '_HC', '_VI', '_AR', '_CH', '_CV', '_FR', '_PO', '_SO', '_SP',
        '_HA', '-SP', '_SW', '-VI', '-SO', '_RU', '_BE', '_KO', '_Urdu',
    )

    # Pre-built endings for str.endswith; the " .pdf" variant tolerates a
    # stray space between the marker and the extension.
    _EXCLUDED_ENDINGS = tuple(
        suffix + extension
        for suffix in EXCLUDED_SUFFIXES
        for extension in ('.pdf', ' .pdf')
    )

    # Tried in order; the first pattern that matches supplies the file ID.
    _FILE_ID_PATTERNS = (
        re.compile(r'/file/d/([a-zA-Z0-9_-]+)'),  # .../file/d/<id>/view
        re.compile(r'[?&]id=([a-zA-Z0-9_-]+)'),   # open?id=<id>, uc?export=download&id=<id>
    )
    _FOLDER_ID_PATTERN = re.compile(r'/folders/([a-zA-Z0-9_-]+)')

    _DRIVE_URL_PREFIX = 'https://drive.google.com/'

    # Seconds to wait for the web server before giving up on a page fetch.
    REQUEST_TIMEOUT_SECONDS = 30

    def __init__(self, output_dir='drive_links'):
        """Authenticate against Google Drive and prepare the output directory.

        Args:
            output_dir: Directory for the generated link lists; created if
                it does not exist yet.
        """
        # Colab's interactive login provides the application-default
        # credentials that are then handed to PyDrive.
        auth.authenticate_user()
        gauth = GoogleAuth()
        gauth.credentials = GoogleCredentials.get_application_default()
        self.drive = GoogleDrive(gauth)
        self.file_count = 0

        self.output_dir = output_dir
        os.makedirs(self.output_dir, exist_ok=True)

    def should_include_file(self, file_name):
        """Decide whether a file should be kept, based on its name.

        Side effect: every accepted name increments ``self.file_count``.

        Args:
            file_name: Drive title of the file.

        Returns:
            False if the name ends in an excluded marker followed by
            ``.pdf`` (or ``" .pdf"``), True otherwise.
        """
        if file_name.endswith(self._EXCLUDED_ENDINGS):
            return False
        self.file_count += 1
        return True

    def extract_file_id_from_link(self, link):
        """Extract the file ID from a Google Drive file link.

        Recognizes ``/file/d/<id>`` paths as well as links that carry the
        ID in an ``id=`` query parameter (``open?id=``, ``uc?id=``,
        ``uc?export=download&id=``).

        Args:
            link: URL to inspect.

        Returns:
            The file ID, or None if the link is empty or no ID is found.
        """
        if not link:
            return None
        for pattern in self._FILE_ID_PATTERNS:
            match = pattern.search(link)
            if match:
                return match.group(1)
        return None

    def get_file_name_from_drive(self, file_id):
        """Look up the title of a Drive file.

        Args:
            file_id: Google Drive file ID.

        Returns:
            The file's title, or None if the metadata could not be fetched
            (the error is printed).
        """
        try:
            file = self.drive.CreateFile({'id': file_id})
            file.FetchMetadata()
            return file['title']
        except Exception as e:
            # Best effort: an inaccessible or missing file must not abort
            # the whole scan.
            print(f"Error fetching file metadata: {e}")
            return None

    def list_files_in_folder(self, folder_id):
        """List the non-excluded, non-trashed entries of a Drive folder.

        Only entries matched by the ``in parents`` query are returned;
        they are not checked for being sub-folders themselves.

        Args:
            folder_id: Google Drive folder ID.

        Returns:
            A list of ``"<title>: https://drive.google.com/file/d/<id>"``
            strings; empty if the listing fails (the error is printed).
        """
        try:
            query = f"'{folder_id}' in parents and trashed=false"
            file_list = self.drive.ListFile({'q': query}).GetList()

            files = []
            for file in file_list:
                file_name = file['title']
                # should_include_file also updates the running file count.
                if self.should_include_file(file_name):
                    files.append(
                        f"{file_name}: https://drive.google.com/file/d/{file['id']}"
                    )
            return files
        except Exception as e:
            # Best effort: one unreadable folder must not abort the scan.
            print(f"Error fetching files: {e}")
            return []

    def extract_folder_id(self, folder_url):
        """Extract the folder ID from a Google Drive folder URL.

        Args:
            folder_url: URL containing ``/folders/<id>``.

        Returns:
            The folder ID.

        Raises:
            ValueError: If the URL has no ``/folders/<id>`` component.
        """
        match = self._FOLDER_ID_PATTERN.search(folder_url)
        if match:
            return match.group(1)
        raise ValueError("Invalid Google Drive folder link. Ensure it contains '/folders/'.")

    def fetch_webpage_content(self, url):
        """Download the HTML of a web page.

        Args:
            url: Address of the page.

        Returns:
            The response body as text, or None if the request failed or
            timed out (the error is printed).
        """
        try:
            # Without a timeout an unresponsive server would block forever.
            response = requests.get(url, timeout=self.REQUEST_TIMEOUT_SECONDS)
            response.raise_for_status()
            return response.text
        except requests.exceptions.RequestException as e:
            print(f"Error fetching the webpage: {e}")
            return None

    def _collect_drive_links(self, soup):
        """Classify every Google Drive anchor found in ``soup``.

        Returns:
            A list of ``(is_folder, entry)`` tuples in document order.
            Folder entries are the raw href; file entries are formatted as
            ``"<title>: <href>"``. File links whose ID or title cannot be
            resolved, or whose title is excluded, are omitted.
        """
        entries = []
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            if self._DRIVE_URL_PREFIX not in href:
                continue
            if 'folders' in href:
                entries.append((True, href))
                continue
            file_id = self.extract_file_id_from_link(href)
            if not file_id:
                continue
            file_name = self.get_file_name_from_drive(file_id)
            if file_name and self.should_include_file(file_name):
                entries.append((False, f"{file_name}: {href}"))
        return entries

    def extract_google_drive_links(self, soup):
        """Extract all Google Drive links from a BeautifulSoup document.

        Args:
            soup: Parsed page.

        Returns:
            A list of strings: the raw href for folder links and
            ``"<title>: <href>"`` for included file links.
        """
        return [entry for _, entry in self._collect_drive_links(soup)]

    def save_links_to_file(self, links):
        """Write the collected links to a timestamped text file.

        Args:
            links: Iterable of strings, written one per line.

        Returns:
            The path of the written file, or None if writing failed (the
            error is printed).
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
        filename = os.path.join(self.output_dir, f"drive_links_{timestamp}.txt")

        try:
            with open(filename, 'w', encoding='utf-8') as f:
                f.writelines(f"{link}\n" for link in links)
        except (OSError, UnicodeError) as e:
            print(f"Error saving links to file: {e}")
            return None

        print(f"Links saved to {filename}")
        return filename

    def scan_website(self, url):
        """Scan a web page and collect the Drive files it links to.

        Folder links are expanded into their files, file links are kept
        as-is, everything is printed, and a non-empty result is saved via
        ``save_links_to_file``.

        Args:
            url: Address of the page to scan.

        Returns:
            A list of ``"<title>: <url>"`` strings; empty if the page could
            not be fetched or contained no usable links.
        """
        all_links = []

        webpage_content = self.fetch_webpage_content(url)
        if not webpage_content:
            # Always hand back a list so callers need no None check.
            return all_links

        soup = BeautifulSoup(webpage_content, 'html.parser')
        # Work on classified entries rather than re-inspecting the formatted
        # strings: a file *title* containing "folders" must not be mistaken
        # for a folder link.
        drive_entries = self._collect_drive_links(soup)

        if drive_entries:
            print("Google Drive links found:")
            for is_folder, entry in drive_entries:
                if not is_folder:
                    all_links.append(entry)
                    print(f"{entry}")
                    continue

                print(f"\nFetching contents of folder: {entry}")
                try:
                    folder_id = self.extract_folder_id(entry)
                except ValueError as e:
                    print(f"Error extracting folder ID: {e}")
                    continue
                file_links = self.list_files_in_folder(folder_id)
                all_links.extend(file_links)
                for file_link in file_links:
                    print(f"{file_link}")
        else:
            print("No Google Drive links found.")

        print(f"Total files: {self.file_count}")

        if all_links:
            self.save_links_to_file(all_links)

        return all_links

def main(url="https://www.bostonpublicschools.org/domain/1884"):
    """Scan a web page for Google Drive links and save what is found.

    Args:
        url: Address of the page to scan. Defaults to the Boston Public
            Schools page this script was originally hard-wired to.
    """
    # Constructing the scanner triggers the Google authentication flow.
    scanner = DriveScanner()
    scanner.scan_website(url)

# Entry point: run the scan only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

Google Drive links found:

Fetching contents of folder: https://drive.google.com/drive/folders/1XNOOmnWE4VMQ-I1stRUkAhogGw9t0I5G?usp=drive_link
ACA-18 Attendance Policies & Procedures.pdf: https://drive.google.com/file/d/1Rq7mPps9BwX6Vfjnc9fKh0MoDN21UL1P
AMT-01 Exam School Application and Admissions.pdf: https://drive.google.com/file/d/1sMHkNfYGn7r8VGEVCToGula73oEtYgwN/view?usp=drive_link
AMT-03 DYS Committed Students.pdf: https://drive.google.com/file/d/1hEpcnEnD17bfEOGhuAgeY1G-aMUctJib/view?usp=drive_link
AMT-04 Grade Requirements.pdf: https://drive.google.com/file/d/1XJrumxGVBFNxG__pHfTKPYI6uQwuf8G5/view?usp=drive_link
AMT-05 Maximum Age Assignment and Enrollment Policy.pdf: https://drive.google.com/file/d/1xB0sAawkLC1HAhM6rbqFzh31CF_5ro03/view?usp=drive_link
AMT-06  Voluntary Transfer Policy.pdf: https://drive.google.com/file/d/1fgaUp4Pn374_lBuJaW1mBZG_JUnWUGy_/view?usp=drive_link
AMT-07 Safety Transfer Request.pdf: https://drive.google.com/file/d/158Utqa8XW8gth30_BexWoRielUKJuke9/