In [3]:
import os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

BASE_URL = "https://source.coop/radiantearth/cloud-cover-detection-challenge/final/public/train_labels"
OUTPUT_DIR = "train_labels_downloaded"


os.makedirs(OUTPUT_DIR, exist_ok=True)

def list_links(url):
    """Devuelve todos los enlaces de una página."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    links = [a["href"] for a in soup.find_all("a", href=True)]
    return links

# 1. Listar carpetas dentro de train_features
folders = [f for f in list_links(BASE_URL) if f.endswith("/")]

print(f"Encontradas {len(folders)} carpetas de chips.")

for folder in tqdm(folders, desc="Descargando carpetas"):
    folder_url = f"{BASE_URL}/{folder}"
    folder_path = os.path.join(OUTPUT_DIR, folder.replace("/", ""))
    os.makedirs(folder_path, exist_ok=True)

    # 2. Listar archivos dentro de cada carpeta
    files = [f for f in list_links(folder_url) if f.endswith(".tif")]

    for file in files:
        file_url = f"{folder_url}/{file}"
        file_path = os.path.join(folder_path, file)

        if os.path.exists(file_path):
            continue

        # 3. Descargar archivo
        with requests.get(file_url, stream=True) as r:
            r.raise_for_status()
            with open(file_path, "wb") as f:
                for chunk in r.iter_content(chunk_size=8192):
                    f.write(chunk)


Encontradas 2 carpetas de chips.


Descargando carpetas: 100%|██████████| 2/2 [00:16<00:00,  8.11s/it]


In [7]:
import os
import requests
from bs4 import BeautifulSoup
from tqdm import tqdm

API_URL = "https://source.coop/api/v1/radiantearth/cloud-cover-detection-challenge/final/public/train_features"
BASE_URL = "https://source.coop/radiantearth/cloud-cover-detection-challenge/final/public/train_features"
OUT = "train_features_full"

os.makedirs(OUT, exist_ok=True)

def list_links(url):
    """Devuelve todos los enlaces de una página."""
    r = requests.get(url)
    soup = BeautifulSoup(r.text, "html.parser")
    links = [a["href"] for a in soup.find_all("a", href=True)]
    return links

# 1. Intentar obtener lista desde la API, si falla usar scraping HTML
try:
    resp = requests.get(API_URL)
    resp.raise_for_status()
    
    # Verificar si la respuesta es JSON válido
    try:
        data = resp.json()
        # Si es una lista de objetos con "path" y "type"
        if isinstance(data, list) and len(data) > 0 and "path" in data[0]:
            files = [f for f in data if f["type"] == "file" and f["path"].endswith(".tif")]
            print(f"Encontrados {len(files)} archivos .tif desde la API")
        else:
            raise ValueError("Formato de API no esperado")
    except ValueError as e:
        print(f"La API no devolvió JSON válido o formato inesperado: {e}")
        print(f"Status code: {resp.status_code}")
        print(f"Content-Type: {resp.headers.get('Content-Type', 'N/A')}")
        print(f"Primeros 500 caracteres de la respuesta: {resp.text[:500]}")
        raise
except (requests.exceptions.RequestException, ValueError) as e:
    print(f"Error al acceder a la API: {e}")
    print("Usando método alternativo: scraping HTML...")
    
    # Método alternativo: usar scraping HTML como en la celda anterior
    folders = [f for f in list_links(BASE_URL) if f.endswith("/")]
    print(f"Encontradas {len(folders)} carpetas.")
    
    files = []
    for folder in tqdm(folders, desc="Listando archivos"):
        folder_url = f"{BASE_URL}/{folder}"
        folder_files = [f for f in list_links(folder_url) if f.endswith(".tif")]
        for file in folder_files:
            files.append({
                "path": f"{folder}{file}",
                "type": "file"
            })
    
    print(f"Encontrados {len(files)} archivos .tif mediante scraping HTML")

# 2. Descargar cada archivo
for f in tqdm(files, desc="Descargando archivos"):
    if isinstance(f, dict):
        file_path_rel = f["path"]
    else:
        file_path_rel = f
    
    file_url = f"{BASE_URL}/{file_path_rel}"
    local_path = os.path.join(OUT, file_path_rel)

    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    if not os.path.exists(local_path):
        r = requests.get(file_url, stream=True)
        r.raise_for_status()
        with open(local_path, "wb") as fp:
            for chunk in r.iter_content(chunk_size=8192):
                fp.write(chunk)


La API no devolvió JSON válido o formato inesperado: Expecting value: line 1 column 1 (char 0)
Status code: 200
Content-Type: text/html; charset=utf-8
Primeros 500 caracteres de la respuesta: <!DOCTYPE html><html lang="en"><head><meta charSet="utf-8"/><meta name="viewport" content="width=device-width, initial-scale=1"/><link rel="stylesheet" href="/_next/static/css/1e5913857760d7aa.css?dpl=dpl_8jQWPjTKwo8ZPwQi3Q1LDvYUm6dy" data-precedence="next"/><link rel="stylesheet" href="/_next/static/css/fdbd78beb3744130.css?dpl=dpl_8jQWPjTKwo8ZPwQi3Q1LDvYUm6dy" data-precedence="next"/><link rel="stylesheet" href="/_next/static/css/7abd75a0e14f04fb.css?dpl=dpl_8jQWPjTKwo8ZPwQi3Q1LDvYUm6dy" data-
Error al acceder a la API: Expecting value: line 1 column 1 (char 0)
Usando método alternativo: scraping HTML...
Encontradas 2 carpetas.


Listando archivos: 100%|██████████| 2/2 [00:07<00:00,  3.87s/it]


Encontrados 0 archivos .tif mediante scraping HTML


Descargando archivos: 0it [00:00, ?it/s]


In [None]:
import os
import time
import urllib.parse as urlparse

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://source.coop/radiantearth/cloud-cover-detection-challenge/final/public"
OUTPUT_DIR = "descargas"

os.makedirs(OUTPUT_DIR, exist_ok=True)

session = requests.Session()
session.headers.update({
    "User-Agent": "Mozilla/5.0 (compatible; YourBot/0.1; +https://example.com/bot)"
})

def get_soup(url, retries=3, backoff=2):
    for i in range(retries):
        try:
            r = session.get(url, timeout=20)
            r.raise_for_status()
            return BeautifulSoup(r.text, "html.parser")
        except Exception as e:
            if i == retries - 1:
                raise
            time.sleep(backoff * (i + 1))

def is_same_domain(url, base_domain):
    parsed = urlparse.urlparse(url)
    return parsed.netloc == "" or parsed.netloc == base_domain

def download_file(url, root_url, output_dir):
    rel_path = urlparse.urlparse(url).path.replace(urlparse.urlparse(root_url).path, "", 1)
    rel_path = rel_path.lstrip("/")
    if not rel_path:
        return

    local_path = os.path.join(output_dir, rel_path)
    os.makedirs(os.path.dirname(local_path), exist_ok=True)

    if os.path.exists(local_path):
        return

    with session.get(url, stream=True, timeout=60) as r:
        r.raise_for_status()
        with open(local_path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

def crawl(url, root_url, output_dir, visited, file_exts=(".tif", ".tiff")):
    if url in visited:
        return
    visited.add(url)

    soup = get_soup(url)
    base_domain = urlparse.urlparse(root_url).netloc

    for a in soup.find_all("a", href=True):
        href = a["href"]
        full = urlparse.urljoin(url, href)

        if not is_same_domain(full, base_domain):
            continue

        path = urlparse.urlparse(full).path

        if any(path.lower().endswith(ext) for ext in file_exts):
            download_file(full, root_url, output_dir)
        else:
            # Heurística simple de “directorio”: termina en “/”
            if path.endswith("/"):
                crawl(full, root_url, output_dir, visited, file_exts=file_exts)

if __name__ == "__main__":
    visited = set()
    crawl(BASE_URL, BASE_URL, OUTPUT_DIR, visited)


In [15]:
import boto3
import os
from tqdm import tqdm
from botocore import UNSIGNED
from botocore.config import Config

OUTPUT_DIR = "train_features_downloaded"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Cliente S3 público (no necesita credenciales)
s3 = boto3.client(
    's3',
    config=Config(signature_version=UNSIGNED),
    region_name='us-west-2'
)

BUCKET = "aws-opendata-us-west-2"
PREFIX = "ref_cloud_cover_detection_challenge_v1/final/public/train_features/"

def download_s3_folder():
    paginator = s3.get_paginator('list_objects_v2')
    
    # Lista todos los objetos
    pages = paginator.paginate(Bucket=BUCKET, Prefix=PREFIX)
    
    tif_files = []
    for page in pages:
        if 'Contents' in page:
            for obj in page['Contents']:
                key = obj['Key']
                if key.endswith('.tif'):
                    tif_files.append(key)
    
    print(f"Encontrados {len(tif_files)} archivos .tif")
    
    # Descarga cada archivo
    for key in tqdm(tif_files, desc="Descargando"):
        local_path = os.path.join(OUTPUT_DIR, os.path.relpath(key, PREFIX))
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        
        if os.path.exists(local_path):
            continue
            
        s3.download_file(BUCKET, key, local_path)
    
    print("¡Descarga completada!")

if __name__ == "__main__":
    download_s3_folder()


NoSuchBucket: An error occurred (NoSuchBucket) when calling the ListObjectsV2 operation: The specified bucket does not exist

In [14]:
import boto3
from botocore import UNSIGNED
from botocore.config import Config

BUCKET = "aws-opendata-us-west-2"
PREFIX = "ref_cloud_cover_detection_challenge_v1/final/public/train_features/"

s3 = boto3.client(
    "s3",
    config=Config(signature_version=UNSIGNED),
    region_name="us-west-2",
)

try:
    resp = s3.list_objects_v2(Bucket=BUCKET, Prefix=PREFIX, MaxKeys=1)
    print("OK, respuesta S3:")
    print(resp.get("KeyCount"), "objetos encontrados (primer page)")
except Exception as e:
    # Mostrar información detallada del error
    print("Error al llamar a S3:")
    print(repr(e))
    if hasattr(e, "response"):
        print("Detalle de error S3:", e.response.get("Error"))

Error al llamar a S3:
NoSuchBucket('An error occurred (NoSuchBucket) when calling the ListObjectsV2 operation: The specified bucket does not exist')
Detalle de error S3: {'Code': 'NoSuchBucket', 'Message': 'The specified bucket does not exist', 'BucketName': 'aws-opendata-us-west-2'}
