<a href="https://colab.research.google.com/github/Dotunbey/Branham-ai/blob/main/Data_extracting.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

import os
import requests
import re
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from google.colab import drive

# --- CONFIGURATION ---
# Index page that lists the sermon PDFs to fetch.
url = "https://end-time-message.org/wm-branham-sermons-pdf/400-william-branham-sermons-1958-pdf"

# Browser-like request headers sent with every HTTP request in this notebook.
headers = {}
headers["User-Agent"] = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
headers["Referer"] = "https://end-time-message.org/"

# Destination folder inside the mounted Google Drive ("Branham_Sermons").
folder_name = "/content/drive/My Drive/Branham_Sermons"

# --- 1. CONNECT TO GOOGLE DRIVE ---
# Files are saved to Drive rather than the Colab VM's local disk.
# Colab may ask for authorization the first time the drive is mounted.
drive.mount('/content/drive')

# Create the destination folder on first use. The mount must already be in
# place: os.mkdir does not create missing parent directories.
folder_missing = not os.path.exists(folder_name)
if folder_missing:
    os.mkdir(folder_name)

print(f"Scanning page: {url}")

# --- 2. DOWNLOAD EVERY LINKED PDF ---
# (connect, read) timeouts in seconds. Without a timeout, requests waits
# indefinitely on a stalled connection and the whole batch hangs.
REQUEST_TIMEOUT = (10, 120)

# Matches the plain `filename=` parameter of a Content-Disposition header,
# either quoted ("...") or as a bare token that ends at the next ';'.
# The extended `filename*=` form is intentionally not matched.
FILENAME_PARAM = re.compile(r'filename\s*=\s*(?:"([^"]*)"|([^;]+))', re.IGNORECASE)


def resolve_filename(content_disposition, fallback):
    """Return a safe local file name for a download.

    Args:
        content_disposition: Value of the Content-Disposition response
            header, or None when the server did not send one.
        fallback: Name to use when the header is missing, has no usable
            ``filename=`` parameter, or yields an empty/unsafe name.

    Returns:
        A bare file name (never containing directory components), with
        ``%20`` sequences turned back into spaces.
    """
    name = ""
    if content_disposition:
        match = FILENAME_PARAM.search(content_disposition)
        if match:
            name = match.group(1) or match.group(2) or ""

    # Fix encoded spaces (kept as-is from the original naming scheme so files
    # saved by earlier runs are still recognised as already downloaded).
    name = name.replace("%20", " ")

    # The header is server-controlled: drop any directory part so a value like
    # "../../x.pdf" cannot escape the target folder.
    name = name.replace("\\", "/").rsplit("/", 1)[-1].strip()

    if name in ("", ".", ".."):
        return fallback
    return name


try:
    # Fetch the index page; fail loudly on 4xx/5xx instead of parsing an
    # error page and reporting "Found 0 links".
    response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Links whose href contains 'view=download' point at the PDF files.
    # The page can list the same file more than once, so de-duplicate while
    # keeping page order (dict preserves insertion order).
    download_urls = list(dict.fromkeys(
        urljoin(url, link.get('href'))
        for link in soup.select("a[href*='view=download']")
    ))
    total = len(download_urls)

    print(f"Found {total} potential download links. Starting download...")

    count = 0  # number of files actually written during this run

    for index, full_url in enumerate(download_urls, start=1):
        try:
            # stream=True fetches only the headers up front, so the file name
            # can be inspected (and the body skipped) before downloading.
            with requests.get(full_url, headers=headers, stream=True,
                              timeout=REQUEST_TIMEOUT) as r:
                r.raise_for_status()

                # The fallback is keyed on the link position, so two files
                # without a usable header never collide on the same name.
                filename = resolve_filename(
                    r.headers.get('content-disposition'),
                    f"document_{index}.pdf",
                )
                save_path = os.path.join(folder_name, filename)

                # Check if we already downloaded it (saves time on re-runs).
                if os.path.exists(save_path):
                    print(f"Skipping (Already exists): {filename}")
                    continue

                print(f"[{index}/{total}] Downloading: {filename}")

                # Write to a temporary ".part" file and rename on success, so
                # an interrupted download is never mistaken for a finished one
                # by the "already exists" check above.
                part_path = save_path + ".part"
                try:
                    with open(part_path, 'wb') as f:
                        for chunk in r.iter_content(chunk_size=8192):
                            f.write(chunk)
                    os.replace(part_path, save_path)
                finally:
                    # Only still present if the download or rename failed.
                    if os.path.exists(part_path):
                        os.remove(part_path)

                count += 1

        except Exception as e:
            # Deliberate best-effort: one bad link must not abort the batch.
            print(f"  Error downloading link {index} ({full_url}): {e}")

    print(f"\nCompleted! Downloaded {count} files to your Google Drive folder: 'Branham_Sermons'")

except Exception as e:
    # Top-level boundary of the notebook cell: report and stop.
    print(f"Critical Error: {e}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Scanning page: https://end-time-message.org/wm-branham-sermons-pdf/400-william-branham-sermons-1958-pdf
Found 172 potential download links. Starting download...
[1/172] Downloading: 58_0105_Have_Faith_In_God.PDF
Skipping (Already exists): 58_0105_Have_Faith_In_God.PDF
[2/172] Downloading: 58_0107_The_Queen_Of_Sheba.PDF
Skipping (Already exists): 58_0107_The_Queen_Of_Sheba.PDF
[3/172] Downloading: 58_0108_The_Handwriting_On_The_Wall.PDF
Skipping (Already exists): 58_0108_The_Handwriting_On_The_Wall.PDF
[4/172] Downloading: 58_0109_The_Called_Out.PDF
Skipping (Already exists): 58_0109_The_Called_Out.PDF
[5/172] Downloading: 58_0110_The_Mighty_Conqueror.PDF
Skipping (Already exists): 58_0110_The_Mighty_Conqueror.PDF
[6/172] Downloading: 58_0112A_His_Wonders_To_Perform.PDF
Skipping (Already exists): 58_0112A_His_Wonders_To_Perform.PDF
[7/172] Downloading: 58_0112