<a href="https://colab.research.google.com/github/AdarshSRM/FirstSite/blob/main/Wayback.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# @title Wayback Machine Media Downloader with Fallback Recovery
import os
import requests
import time
import zipfile
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import ipywidgets as widgets
from IPython.display import display, clear_output
from google.colab import files

def get_fallback_snapshots(original_url):
    """Query the Wayback CDX API for successful captures of a specific file.

    Args:
        original_url: The original (un-archived) URL of the file to look up.

    Returns:
        A list of capture timestamps (strings) whose recorded status code is
        200, in the reverse of the order the API returned them. The list is
        empty when the lookup fails, the API answers with a non-200 status,
        or no captures are listed.
    """
    # HTTPS for consistency with the snapshot URLs built by the caller.
    cdx_url = "https://web.archive.org/cdx/search/cdx"
    params = {
        "url": original_url,
        "output": "json",
        "fl": "timestamp,statuscode",
        "filter": "statuscode:200"
    }
    try:
        r = requests.get(cdx_url, params=params, timeout=10)
        if r.status_code != 200:
            return []
        data = r.json()
    except (requests.RequestException, ValueError) as exc:
        # Best-effort lookup: network problems and malformed JSON mean
        # "no history available", but say so instead of hiding it.
        print(f"⚠️ Snapshot lookup failed for {original_url}: {exc}")
        return []

    if not isinstance(data, list) or len(data) <= 1:
        return []

    # The first row is the column header; every later row starts with the
    # timestamp. Reversed so the newest capture comes first (assuming the
    # API's default oldest-first ordering).
    timestamps = [row[0] for row in data[1:] if row]
    return timestamps[::-1]

def download_images(target_url):
    """Download media linked from a page and hand it to the user as a zip.

    Scans every ``<a href>`` on ``target_url`` for links that contain
    ``/media3/`` and one of the known media extensions, downloads each file
    into the local ``images`` folder, and falls back to other Wayback
    Machine captures (via ``get_fallback_snapshots``) when the primary link
    does not answer with HTTP 200. Successfully saved files are zipped to
    ``/content/images.zip`` and offered through ``files.download``.

    Args:
        target_url: URL of the page to scrape.

    Returns:
        None. Progress, a summary and the list of failed URLs are printed;
        any unexpected error is reported as ``Error: ...`` rather than raised.
    """
    print(f"Scraping: {target_url}\n")
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/91.0.4472.124 Safari/537.36'}
    media_extensions = ('.jpg', '.jpeg', '.png', '.mp4')
    chunk_size = 64 * 1024

    try:
        # A timeout keeps a stalled server from hanging the cell forever.
        response = requests.get(target_url, headers=headers, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')

        media_entries = []
        seen_urls = set()  # O(1) duplicate check instead of rebuilding a list
        for link in soup.find_all('a', href=True):
            href = link['href']
            lowered = href.lower()
            if '/media3/' not in href or not any(ext in lowered for ext in media_extensions):
                continue
            full_url = urljoin(target_url, href)
            if full_url in seen_urls:
                continue
            seen_urls.add(full_url)
            # Extract the original source URL from the Wayback wrapper.
            # Split only once so an original URL that itself contains
            # "/http" is kept whole.
            parts = full_url.split('/http', 1)
            original_src = 'http' + parts[1] if len(parts) > 1 else full_url
            media_entries.append({'url': full_url, 'original_src': original_src})

        if not media_entries:
            print("No high-res media links found.")
            return

        folder_name = "images"
        os.makedirs(folder_name, exist_ok=True)

        downloaded_paths = []
        failed_urls = []
        used_names = set()
        recovered_count = 0

        for index, item in enumerate(media_entries, start=1):
            url = item['url']

            # Pick a non-empty file name that is unique within this run so
            # two links sharing a basename do not overwrite each other.
            base_name = os.path.basename(urlparse(url).path) or f"media_{index}"
            stem, ext = os.path.splitext(base_name)
            filename = base_name
            suffix = 1
            while filename in used_names:
                filename = f"{stem}_{suffix}{ext}"
                suffix += 1
            used_names.add(filename)

            stream = None      # open streaming response to save, if any
            recovered = False  # True when the file came from a fallback capture

            # Try primary download
            try:
                stream = requests.get(url, headers=headers, stream=True, timeout=10)
            except requests.RequestException:
                print(f"⚠️ Connection error for {filename}. Checking history...")
            else:
                if stream.status_code != 200:
                    print(f"⚠️ Primary link failed ({stream.status_code}) for {filename}. Checking history...")
                    stream.close()
                    stream = None

            # Fallback logic: search previous captures
            if stream is None:
                for timestamp in get_fallback_snapshots(item['original_src']):
                    fallback_url = f"https://web.archive.org/web/{timestamp}/{item['original_src']}"
                    try:
                        candidate = requests.get(fallback_url, headers=headers, stream=True, timeout=10)
                    except requests.RequestException:
                        continue
                    if candidate.status_code == 200:
                        stream = candidate
                        recovered = True
                        print(f"✅ Recovered {filename} from snapshot: {timestamp}")
                        break
                    # Release the connection of a rejected capture.
                    candidate.close()

            if stream is None:
                failed_urls.append(url)
                continue

            filepath = os.path.join(folder_name, filename)
            try:
                # `with stream` closes the connection even if writing fails.
                with stream, open(filepath, 'wb') as f:
                    for chunk in stream.iter_content(chunk_size):
                        f.write(chunk)
            except (requests.RequestException, OSError) as exc:
                # One broken transfer must not abort the whole batch.
                print(f"⚠️ Failed while saving {filename}: {exc}")
                try:
                    if os.path.exists(filepath):
                        os.remove(filepath)  # drop the partial file
                except OSError:
                    pass
                failed_urls.append(url)
                continue

            downloaded_paths.append(filepath)
            if recovered:
                recovered_count += 1
            else:
                print(f"Downloaded: {filename}")

        # Zip it up
        if downloaded_paths:
            zip_path = "/content/images.zip"
            with zipfile.ZipFile(zip_path, 'w') as zipf:
                for file in downloaded_paths:
                    zipf.write(file, os.path.join("images", os.path.basename(file)))
            print("\n--- Process Complete ---")
            print(f"Successfully downloaded: {len(downloaded_paths)} (Recovered from history: {recovered_count})")
            files.download(zip_path)

        if failed_urls:
            print("\n" + "!"*10 + " FAILED URLS " + "!"*10)
            for f_url in failed_urls:
                print(f_url)

    except Exception as e:
        # Top-level boundary for the notebook cell: report, do not raise.
        print(f"Error: {e}")

# Notebook UI: a URL text box, a trigger button and an output area.
url_input = widgets.Text(
    value='https://web.archive.org/web/20150712230220/http://www.notiblog.com/soledad-cescato-prefiere-trio-y-partuza',
    layout=widgets.Layout(width='80%'),
)
button = widgets.Button(
    description='Download Media',
    button_style='success',
)
output = widgets.Output()


def run(b):
    """Button click handler: clear the output area and start a download.

    Args:
        b: The button instance passed by the click callback (unused).
    """
    # Everything printed inside this context is captured by the output widget.
    with output:
        clear_output()
        download_images(url_input.value)


button.on_click(run)
display(url_input, button, output)