In [1]:
# Refresh the apt package index; stdout is discarded to keep the cell output short.
!apt-get -yq update >/dev/null
# Install 7-Zip quietly (the trailing shell comment records why it is needed).
!apt-get -yq install p7zip-full >/dev/null  # for .7z integrity tests
# Install ffmpeg; the video validity check in this notebook shells out to ffprobe/ffmpeg.
!apt-get -y install ffmpeg

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
ffmpeg is already the newest version (7:4.4.2-0ubuntu0.22.04.1).
0 upgraded, 0 newly installed, 0 to remove and 41 not upgraded.


In [2]:
import os
import shutil
import subprocess
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup
from requests.auth import HTTPBasicAuth
from tqdm.notebook import tqdm

In [3]:
# Page requested first to confirm that the credentials are accepted.
AUTH_PAGE_URL = "https://repo.octopus-intelligence.org/auth_message.html"
# Root directory listing where the crawl starts. The crawler treats only URLs
# ending in "/" as directories, so keep the trailing slash.
START_URL = "https://repo.octopus-intelligence.org/browse/"
# Credentials are read from the environment (OCTOPUS_USERNAME / OCTOPUS_PASSWORD)
# so they never have to be saved inside the notebook. When the variables are
# unset the values fall back to the empty string.
USERNAME = os.environ.get("OCTOPUS_USERNAME", "") # Contact project administrator for Dataset Username
PASSWORD = os.environ.get("OCTOPUS_PASSWORD", "") # Contact project administrator for Dataset Password
# Timeout in seconds applied to HTTP requests made through the shared session.
TIMEOUT = 60
# Intended toggle for is_video_valid's `deep` mode:
# False = quick ffprobe check, True = full ffmpeg decode.
DEEP_VIDEO_CHECK = False

In [4]:
# Scratch area on the local disk where downloads land before validation.
CACHE_DOWNLOAD_ROOT = Path("/content/octopus_repo")
# Final destination for validated files and for the data.csv manifest.
# NOTE(review): assumes Google Drive is already mounted at /content/drive;
# no mount call is visible in this notebook - confirm before running.
DRIVE_TARGET = Path("/content/drive/MyDrive/octopus_repo")

# Make sure both directory trees exist; already-existing directories are fine.
for _root_dir in (CACHE_DOWNLOAD_ROOT, DRIVE_TARGET):
    _root_dir.mkdir(parents=True, exist_ok=True)

In [5]:
# One shared HTTP session so every request carries the same User-Agent header
# and HTTP Basic credentials.
session = requests.Session()
session.headers["User-Agent"] = "ColabCrawler/1.0"
session.auth = HTTPBasicAuth(USERNAME, PASSWORD)

def check(url):
    """Fetch ``url`` through the shared session and return the response.

    Redirects are followed and the request is bounded by ``TIMEOUT`` seconds.
    The status code is not inspected here; callers decide how to react to it.
    """
    return session.get(url, timeout=TIMEOUT, allow_redirects=True)

# Step 1: hit the auth splash page to confirm the credentials are accepted.
r = check(AUTH_PAGE_URL)
auth_status = r.status_code
if auth_status == 401 or auth_status == 403:
    raise RuntimeError("Authentication failed on AUTH_PAGE_URL (401/403). Check USERNAME/PASSWORD.")
if auth_status >= 400:
    raise RuntimeError(f"Error hitting AUTH_PAGE_URL: {r.status_code}")
print("Auth splash reachable")

# Step 2: confirm the crawl root itself is readable with these credentials.
r2 = check(START_URL)
start_status = r2.status_code
if start_status == 401 or start_status == 403:
    raise RuntimeError("Authenticated, but START_URL is unauthorized (401/403). Point START_URL to a directory you can access.")
if start_status >= 400:
    raise RuntimeError(f"Error hitting START_URL: {r2.status_code}")
print("START_URL reachable")

Auth splash reachable
START_URL reachable


In [6]:
# Breadth-first crawl of the directory listing rooted at START_URL.
#
# `mp4_url_list` doubles as the work queue: every entry before index `seen` is
# a confirmed file URL, while the entry at `seen` is either a file (advance the
# cursor) or a directory (replace it with its children, appended at the end).
# When the loop ends the list holds file URLs only (all file types, not just
# .mp4, despite the name).
ignored_hrefs = ("browse/", "./", "../")  # self / parent links in the listing
mp4_url_list = [START_URL]
seen = 0
while seen < len(mp4_url_list):
    step = mp4_url_list[seen]
    if not step.endswith("/"):
        # Not a directory: keep it and move on.
        seen += 1
        continue

    # Bound the request like check() does, and fail loudly on an HTTP error
    # rather than parsing an error page and silently dropping a whole subtree.
    listing = session.get(step, timeout=TIMEOUT)
    listing.raise_for_status()
    soup = BeautifulSoup(listing.text, "html.parser")
    for link in soup.find_all("a"):
        href = link.get("href")
        # Anchors without an href would raise on `step + None`; an empty href
        # would re-queue the current directory forever.
        if not href or href in ignored_hrefs:
            continue
        mp4_url_list.append(step + href)

    # Every entry before `seen` is a file, so the directory being expanded is
    # exactly the element at `seen`; delete by index instead of searching.
    del mp4_url_list[seen]

In [7]:
def is_video_valid(temp_location: Path, deep: bool = False, timeout: int = 60) -> bool:
    """
    Check whether the video at ``temp_location`` looks valid.

    Args:
        temp_location (Path): Path to the video file.
        deep (bool): If False, run a quick ``ffprobe`` query for the codec of
                     the first video stream (fast).
                     If True, decode the whole file with ``ffmpeg`` (slow, but
                     thorough).
        timeout (int): Max seconds to let the external tool run.

    Returns:
        bool: True if the check passed. False if the file failed the check,
        the tool timed out, or the tool could not be run at all (for example
        ffprobe/ffmpeg is not installed) - those failures are deliberately
        reported as "not valid" instead of raising.
    """
    fp = str(temp_location)

    if deep:
        # Deep check: decode every frame and discard the output.
        cmd = ["ffmpeg", "-v", "error", "-i", fp, "-f", "null", "-"]
    else:
        # Quick check: print only the codec name of the first video stream.
        cmd = [
            "ffprobe", "-v", "error",
            "-select_streams", "v:0",
            "-show_entries", "stream=codec_name",
            "-of", "default=noprint_wrappers=1:nokey=1",
            fp,
        ]

    try:
        # errors="replace" keeps non-UTF-8 bytes in the tool output (e.g. odd
        # file names echoed in messages) from raising during decoding, which
        # would otherwise be reported as a corrupted file.
        res = subprocess.run(cmd, capture_output=True, text=True,
                             errors="replace", timeout=timeout)
    except Exception:
        # Timeout, missing binary, etc.: best-effort contract is "not valid".
        return False

    if res.returncode != 0:
        return False
    if deep:
        # ffmpeg can conceal decode errors and still exit 0; at "-v error"
        # anything written to stderr is an error, so require it to be empty.
        return res.stderr.strip() == ""
    # A valid video must expose at least one video stream with a codec name.
    return res.stdout.strip() != ""


In [8]:
# Build the download manifest: one row per crawled URL, nothing checked yet.
keys = ["url", "checked", "location_saved", "is_corrupted"]

# Path of each URL relative to START_URL; reused later as the on-disk location.
relative_paths = [Path(u).relative_to(START_URL) for u in mp4_url_list]
dataset = [
    (u, False, rel_path, False)
    for u, rel_path in zip(mp4_url_list, relative_paths)
]

data = pd.DataFrame(dataset, columns=keys)
manifest_csv = f"{DRIVE_TARGET}/data.csv"
data.to_csv(manifest_csv, index=False)

In [9]:
# Download pass: walk the manifest, fetch every wanted file into the local
# cache, validate videos, move good files to Drive, and checkpoint the manifest
# after each row that changes.

def _is_corrupted_file(path: Path) -> bool:
    """Return True when ``path`` is an .mp4 that fails the video check.

    Files with any other suffix are never flagged as corrupted.
    """
    if path.suffix != ".mp4":
        return False
    return not is_video_valid(path, deep=DEEP_VIDEO_CHECK)


manifest_path = f"{DRIVE_TARGET}/data.csv"
max_row_index = 1000  # rows with a larger index are left for a later run

data = pd.read_csv(manifest_path)
for idx, row in tqdm(data.iterrows(), total=len(data)):
    if idx > max_row_index:
        continue

    url = row["url"]
    location_saved = row["location_saved"]
    temp_location = CACHE_DOWNLOAD_ROOT / location_saved
    final_location = DRIVE_TARGET / location_saved

    # Already on Drive (e.g. from an earlier run): just re-validate and record.
    if final_location.exists():
        data.loc[idx, "is_corrupted"] = _is_corrupted_file(final_location)
        data.loc[idx, "checked"] = True
        data.to_csv(manifest_path, index=False)
        continue

    # Only videos whose URL mentions "eledone" are wanted; other .mp4s are skipped.
    if url.endswith(".mp4") and "eledone" not in url:
        continue

    # Handled in a previous run (and not moved to Drive): nothing to do.
    if bool(row["checked"]):
        continue

    temp_location.parent.mkdir(parents=True, exist_ok=True)
    # Stream to disk so large videos are never held fully in memory.
    with session.get(url, timeout=TIMEOUT, stream=True) as response:
        if response.status_code >= 400:
            # Do not store an error page as if it were the file; leave the row
            # unchecked so a later run retries it.
            print(f"Skipping {url}: HTTP {response.status_code}")
            continue
        with open(temp_location, "wb") as f:
            for chunk in response.iter_content(chunk_size=1024 * 1024):
                f.write(chunk)

    corrupted = _is_corrupted_file(temp_location)
    data.loc[idx, "checked"] = True
    data.loc[idx, "is_corrupted"] = corrupted
    final_location.parent.mkdir(parents=True, exist_ok=True)
    if not corrupted:
        # Only validated files are promoted from the cache to Drive.
        shutil.move(temp_location, final_location)

    data.to_csv(manifest_path, index=False)

data.head()

  0%|          | 0/10948 [00:00<?, ?it/s]

Unnamed: 0,url,checked,location_saved,is_corrupted
0,https://repo.octopus-intelligence.org/browse/a...,True,auth_message.html,False
1,https://repo.octopus-intelligence.org/browse/d...,True,directory%20names%20x%20camera%20names.png,False
2,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False
3,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False
4,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False


In [10]:
# Reload the manifest and keep only .mp4 rows that were checked and passed.
data = pd.read_csv(f"{DRIVE_TARGET}/data.csv")

is_mp4 = data["location_saved"].str.endswith('.mp4')
is_intact = ~data["is_corrupted"]
was_checked = data["checked"]

filtered = data[is_mp4 & is_intact & was_checked]
filtered.head()

Unnamed: 0,url,checked,location_saved,is_corrupted
2,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False
3,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False
4,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False
5,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False
6,https://repo.octopus-intelligence.org/browse/O...,True,O%20eledone%202024-10_2024-11/Right%20back/202...,False
